diff options
Diffstat (limited to 'src/core/CL/cl_kernels')
32 files changed, 0 insertions, 8635 deletions
diff --git a/src/core/CL/cl_kernels/absdiff.cl b/src/core/CL/cl_kernels/absdiff.cl deleted file mode 100644 index a09caf5dc5..0000000000 --- a/src/core/CL/cl_kernels/absdiff.cl +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2016, 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** Calculate the absolute difference of two input images. - * - * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:\n - * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short - * - * @param[in] in1_ptr Pointer to the first source image. 
Supported data types: U8, S16 - * @param[in] in1_stride_x Stride of the first source image in X dimension (in bytes) - * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in1_stride_y Stride of the first source image in Y dimension (in bytes) - * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the first source image - * @param[in] in2_ptr Pointer to the second source image. Supported data types: U8, S16 - * @param[in] in2_stride_x Stride of the second source image in X dimension (in bytes) - * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in2_stride_y Stride of the second source image in Y dimension (in bytes) - * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the second source image - * @param[out] out_ptr Pointer to the destination image. 
Supported data types: U8, S16 - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void absdiff( - IMAGE_DECLARATION(in1), - IMAGE_DECLARATION(in2), - IMAGE_DECLARATION(out)) -{ - Image in1 = CONVERT_TO_IMAGE_STRUCT(in1); - Image in2 = CONVERT_TO_IMAGE_STRUCT(in2); - Image out = CONVERT_TO_IMAGE_STRUCT(out); - - VEC_DATA_TYPE(DATA_TYPE_OUT, 16) - in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)); - VEC_DATA_TYPE(DATA_TYPE_OUT, 16) - in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)); - - vstore16(CONVERT_SAT(abs_diff(in_a, in_b), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr); -} diff --git a/src/core/CL/cl_kernels/accumulate.cl b/src/core/CL/cl_kernels/accumulate.cl deleted file mode 100644 index 9e37830f1b..0000000000 --- a/src/core/CL/cl_kernels/accumulate.cl +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (c) 2016, 2017 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** This function accumulates an input image into an output image. - * - * @param[in] input_ptr Pointer to the source image. Supported data types: U8 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in,out] accu_ptr Pointer to the destination image. 
Supported data types: S16 - * @param[in] accu_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] accu_step_x accu_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] accu_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] accu_step_y accu_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] accu_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void accumulate( - IMAGE_DECLARATION(input), - IMAGE_DECLARATION(accu)) -{ - // Get pixels pointer - Image input = CONVERT_TO_IMAGE_STRUCT(input); - Image accu = CONVERT_TO_IMAGE_STRUCT(accu); - - // Load 16 input pixels and the 16 matching accumulator values - uchar16 in_data = vload16(0, input.ptr); - short16 accu_data = vload16(0, (__global short *)accu.ptr); - - // Perform saturating accumulation - short16 res = add_sat(convert_short16(in_data), accu_data); - - // Store result back into the accumulation image - vstore16(res, 0, (__global short *)accu.ptr); -} - -/** This function accumulates a weighted value from an input image to an output image. - * - * @param[in] input_ptr Pointer to the source image. Supported data types: U8 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in,out] accu_ptr Pointer to the destination image. 
Supported data types: U8 - * @param[in] accu_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] accu_step_x accu_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] accu_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] accu_step_y accu_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] accu_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] alpha The float scalar value with a value in the range of 0 to 1 - */ -__kernel void accumulate_weighted( - IMAGE_DECLARATION(input), - IMAGE_DECLARATION(accu), - const float alpha) -{ - // Get pixels pointer - Image input = CONVERT_TO_IMAGE_STRUCT(input); - Image accu = CONVERT_TO_IMAGE_STRUCT(accu); - - // Load 16 elements from each image and convert them to float - const float16 in_data = convert_float16(vload16(0, input.ptr)); - const float16 accu_data = convert_float16(vload16(0, accu.ptr)); - - // Calculate weighted accumulation: (1 - alpha) * accu + alpha * input - const uchar16 res = convert_uchar16((1.0f - alpha) * accu_data + alpha * in_data); - - // Store result back into the accumulation image - vstore16(res, 0, accu.ptr); -} - -/** This function accumulates a squared value from an input image to an output image. - * - * @param[in] input_ptr Pointer to the source image. Supported data types: U8 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in,out] accu_ptr Pointer to the destination image. 
Supported data types: S16 - * @param[in] accu_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] accu_step_x accu_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] accu_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] accu_step_y accu_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] accu_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] shift The U32 scalar value with a value in the range of 0 to 15 - */ -__kernel void accumulate_squared( - IMAGE_DECLARATION(input), - IMAGE_DECLARATION(accu), - const uint shift) -{ - // Get pixels pointer - Image input = CONVERT_TO_IMAGE_STRUCT(input); - Image accu = CONVERT_TO_IMAGE_STRUCT(accu); - - // Load 16 input pixels (widened to ushort) and 16 accumulator values (converted to uint) - ushort16 in_data = convert_ushort16(vload16(0, input.ptr)); - uint16 accu_data = convert_uint16(vload16(0, (__global short *)accu.ptr)); - - // Add the squared input, right-shifted by shift, to the accumulator and saturate to short - short16 res = convert_short16_sat(accu_data + convert_uint16((in_data * in_data) >> shift)); - - // Store result back into the accumulation image - vstore16(res, 0, (__global short *)accu.ptr); -} diff --git a/src/core/CL/cl_kernels/canny.cl b/src/core/CL/cl_kernels/canny.cl deleted file mode 100644 index bcff8438db..0000000000 --- a/src/core/CL/cl_kernels/canny.cl +++ /dev/null @@ -1,454 +0,0 @@ -/* - * Copyright (c) 2017-2018 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** Calculate the gradient magnitude and phase from the horizontal and vertical results of a Sobel filter. - * - * @note The calculation of gradient uses level 1 normalisation. - * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT: - * e.g. -DDATA_TYPE_IN=short -DDATA_TYPE_OUT=ushort - * - * @param[in] src1_ptr Pointer to the source image (Horizontal result of Sobel). 
Supported data types: S16, S32 - * @param[in] src1_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src1_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] src2_ptr Pointer to the source image (Vertical result of Sobel). Supported data types: S16, S32 - * @param[in] src2_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src2_step_x src2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src2_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src2_step_y src2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] grad_ptr Pointer to the gradient output. Supported data types: U16, U32 - * @param[in] grad_stride_x Stride of the gradient image in X dimension (in bytes) - * @param[in] grad_step_x grad_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] grad_stride_y Stride of the gradient image in Y dimension (in bytes) - * @param[in] grad_step_y grad_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] grad_offset_first_element_in_bytes The offset of the first element of the gradient image - * @param[out] angle_ptr Pointer to the angle output. 
Supported data types: U8 - * @param[in] angle_stride_x Stride of the angle image in X dimension (in bytes) - * @param[in] angle_step_x angle_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] angle_stride_y Stride of the angle image in Y dimension (in bytes) - * @param[in] angle_step_y angle_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] angle_offset_first_element_in_bytes The offset of the first element of the angle image - */ -__kernel void combine_gradients_L1( - IMAGE_DECLARATION(src1), - IMAGE_DECLARATION(src2), - IMAGE_DECLARATION(grad), - IMAGE_DECLARATION(angle)) -{ - // Construct images - Image src1 = CONVERT_TO_IMAGE_STRUCT(src1); - Image src2 = CONVERT_TO_IMAGE_STRUCT(src2); - Image grad = CONVERT_TO_IMAGE_STRUCT(grad); - Image angle = CONVERT_TO_IMAGE_STRUCT(angle); - - // Load sobel horizontal (src1) and vertical (src2) values - VEC_DATA_TYPE(DATA_TYPE_IN, 4) - h = vload4(0, (__global DATA_TYPE_IN *)src1.ptr); - VEC_DATA_TYPE(DATA_TYPE_IN, 4) - v = vload4(0, (__global DATA_TYPE_IN *)src2.ptr); - - /* Calculate the gradient, using level 1 normalisation method: |h| + |v| */ - VEC_DATA_TYPE(DATA_TYPE_OUT, 4) - m = CONVERT_SAT((abs(h) + abs(v)), VEC_DATA_TYPE(DATA_TYPE_OUT, 4)); - - /* Calculate the angle in degrees */ - float4 p = 180.0f * atan2pi(convert_float4(v), convert_float4(h)); - - /* Remap negative angles by adding 180 so that the angle lies in [0, 180] */ - p = select(p, p + 180.0f, p < 0.0f); - - /* Store results */ - vstore4(m, 0, (__global DATA_TYPE_OUT *)grad.ptr); - vstore4(convert_uchar4_sat_rte(p), 0, angle.ptr); -} - -/** Calculate the gradient magnitude and angle from the horizontal and vertical results of a Sobel filter. - * - * @note The calculation of gradient uses level 2 normalisation - * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT: - * e.g. -DDATA_TYPE_IN=short -DDATA_TYPE_OUT=ushort - * - * @param[in] src1_ptr Pointer to the source image (Horizontal result of Sobel). 
Supported data types: S16, S32 - * @param[in] src1_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src1_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] src2_ptr Pointer to the source image (Vertical result of Sobel). Supported data types: S16, S32 - * @param[in] src2_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src2_step_x src2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src2_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src2_step_y src2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] grad_ptr Pointer to the gradient output. Supported data types: U16, U32 - * @param[in] grad_stride_x Stride of the gradient image in X dimension (in bytes) - * @param[in] grad_step_x grad_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] grad_stride_y Stride of the gradient image in Y dimension (in bytes) - * @param[in] grad_step_y grad_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] grad_offset_first_element_in_bytes The offset of the first element of the gradient image - * @param[out] angle_ptr Pointer to the angle output. 
Supported data types: U8 - * @param[in] angle_stride_x Stride of the angle image in X dimension (in bytes) - * @param[in] angle_step_x angle_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] angle_stride_y Stride of the angle image in Y dimension (in bytes) - * @param[in] angle_step_y angle_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] angle_offset_first_element_in_bytes The offset of the first element of the angle image - */ -__kernel void combine_gradients_L2( - IMAGE_DECLARATION(src1), - IMAGE_DECLARATION(src2), - IMAGE_DECLARATION(grad), - IMAGE_DECLARATION(angle)) -{ - // Construct images - Image src1 = CONVERT_TO_IMAGE_STRUCT(src1); - Image src2 = CONVERT_TO_IMAGE_STRUCT(src2); - Image grad = CONVERT_TO_IMAGE_STRUCT(grad); - Image angle = CONVERT_TO_IMAGE_STRUCT(angle); - - // Load sobel horizontal (src1) and vertical (src2) values as floats - float4 h = convert_float4(vload4(0, (__global DATA_TYPE_IN *)src1.ptr)); - float4 v = convert_float4(vload4(0, (__global DATA_TYPE_IN *)src2.ptr)); - - /* Calculate the gradient, using level 2 normalisation method: sqrt(h^2 + v^2) */ - float4 m = sqrt(h * h + v * v); - - /* Calculate the angle in degrees */ - float4 p = 180.0f * atan2pi(v, h); - - /* Remap negative angles by adding 180 so that the angle lies in [0, 180] */ - p = select(p, p + 180.0f, p < 0.0f); - - /* Store results */ - vstore4(CONVERT_SAT_ROUND(m, VEC_DATA_TYPE(DATA_TYPE_OUT, 4), rte), 0, (__global DATA_TYPE_OUT *)grad.ptr); - vstore4(convert_uchar4_sat_rte(p), 0, angle.ptr); -} - -#define EDGE 255 -#define NO_EDGE 0 - -/** Array that holds, for each quantised direction, the relative coordinate offsets (x0, y0, x1, y1) of the two neighbouring pixels. - */ -__constant short4 neighbours_coords[] = -{ - { -1, 0, 1, 0 }, // 0 degrees - { -1, -1, 1, 1 }, // 45 degrees - { 0, -1, 0, 1 }, // 90 degrees - { 1, -1, -1, 1 }, // 135 degrees -}; - -/** Perform non maximum suppression. - * - * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT: - * e.g. 
-DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short - * - * @param[in] grad_ptr Pointer to the gradient output. Supported data types: S16, S32 - * @param[in] grad_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] grad_step_x grad_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] grad_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] grad_step_y grad_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] grad_offset_first_element_in_bytes The offset of the first element of the output - * @param[in] angle_ptr Pointer to the angle output. Supported data types: U8 - * @param[in] angle_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] angle_step_x angle_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] angle_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] angle_step_y angle_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] angle_offset_first_element_in_bytes The offset of the first element of the output - * @param[out] non_max_ptr Pointer to the non maximum suppressed output. 
Supported data types: U16, U32 - * @param[in] non_max_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] non_max_step_x non_max_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] non_max_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] non_max_step_y non_max_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] non_max_offset_first_element_in_bytes The offset of the first element of the destination image - * @param[in] lower_thr The low threshold - */ -__kernel void suppress_non_maximum( - IMAGE_DECLARATION(grad), - IMAGE_DECLARATION(angle), - IMAGE_DECLARATION(non_max), - uint lower_thr) -{ - // Construct images - Image grad = CONVERT_TO_IMAGE_STRUCT(grad); - Image angle = CONVERT_TO_IMAGE_STRUCT(angle); - Image non_max = CONVERT_TO_IMAGE_STRUCT(non_max); - - // Index of the current pixel - const int x = get_global_id(0); - const int y = get_global_id(1); - - // Get gradient and angle - DATA_TYPE_IN gradient = *((__global DATA_TYPE_IN *)grad.ptr); - uchar an = *((__global uchar *)angle.ptr); - - // Early return (nothing is written) if not greater than lower threshold - if(gradient <= lower_thr) - { - return; - } - - // Quantise the angle into one of 4 directions (0, 45, 90 or 135 degrees) - DATA_TYPE_OUT q_an; - - if(an < 22.5f || an >= 157.5f) - { - q_an = 0; - } - else if(an < 67.5f) - { - q_an = 1; - } - else if(an < 112.5f) - { - q_an = 2; - } - else - { - q_an = 3; - } - - // Find the two neighbouring pixels for the quantised direction (x offsets in s0/s2, y offsets in s1/s3) - short2 x_p = neighbours_coords[q_an].s02; - short2 y_p = neighbours_coords[q_an].s13; - DATA_TYPE_IN g1 = *((global DATA_TYPE_IN *)offset(&grad, x_p.x, y_p.x)); - DATA_TYPE_IN g2 = *((global DATA_TYPE_IN *)offset(&grad, x_p.y, y_p.y)); - - if((gradient > g1) && (gradient > g2)) - { - __global uchar *non_max_addr = non_max_ptr + non_max_offset_first_element_in_bytes + x * non_max_stride_x + y * non_max_stride_y; - *((global DATA_TYPE_OUT *)non_max_addr) = gradient; - } -} - -#define 
hysteresis_local_stack_L1 8 // The size of level 1 stack. This has to agree with the host side -#define hysteresis_local_stack_L2 16 // The size of level 2 stack; adjusting this can impact the match rate with the VX implementation - -/** Check whether pixel is valid - * - * Skip the pixel if early_test is true. - * Otherwise, try to push the pixel coordinates onto the level 1 stack of the current pixel, or jump to popping the stack instead if that stack is full - * - * @param[in] early_test Boolean condition based on the minv check and visited buffer check - * @param[in] x_pos X-coordinate of pixel that is going to be recorded, has to be within the boundary - * @param[in] y_pos Y-coordinate of pixel that is going to be recorded, has to be within the boundary - * @param[in] x_cur X-coordinate of current central pixel - * @param[in] y_cur Y-coordinate of current central pixel - */ -#define check_pixel(early_test, x_pos, y_pos, x_cur, y_cur) \ - { \ - if(!early_test) \ - { \ - /* Number of elements in the local stack 1, points to next available entry */ \ - c = *((__global char *)offset(&l1_stack_counter, x_cur, y_cur)); \ - \ - if(c > (hysteresis_local_stack_L1 - 1)) /* Stack level 1 is full */ \ - goto pop_stack; \ - \ - /* The pixel that has already been recorded is ignored */ \ - if(!atomic_or((__global uint *)offset(&recorded, x_pos, y_pos), 1)) \ - { \ - l1_ptr[c] = (short2)(x_pos, y_pos); \ - *((__global char *)offset(&l1_stack_counter, x_cur, y_cur)) += 1; \ - } \ - } \ - } - -/** Perform hysteresis. - * - * @attention The input data_type needs to be passed at compile time using -DDATA_TYPE_IN: e.g. -DDATA_TYPE_IN=short - * - * @param[in] src_ptr Pointer to the input image. 
Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element of the source image - * @param[out] out_ptr Pointer to the output image. Supported data types: U8 - * @param[in] out_stride_x Stride of the output image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the output image in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element of the output image - * @param[out] visited_ptr Pointer to the visited buffer, where pixels are marked as visited. Supported data types: U32 - * @param[in] visited_stride_x Stride of the visited buffer in X dimension (in bytes) - * @param[in] visited_step_x visited_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] visited_stride_y Stride of the visited buffer in Y dimension (in bytes) - * @param[in] visited_step_y visited_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] visited_offset_first_element_in_bytes The offset of the first element of the visited buffer - * @param[out] recorded_ptr Pointer to the recorded buffer, where pixels are marked as recorded. 
Supported data types: U32 - * @param[in] recorded_stride_x Stride of the recorded buffer in X dimension (in bytes) - * @param[in] recorded_step_x recorded_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] recorded_stride_y Stride of the recorded buffer in Y dimension (in bytes) - * @param[in] recorded_step_y recorded_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] recorded_offset_first_element_in_bytes The offset of the first element of the recorded buffer - * @param[out] l1_stack_ptr Pointer to the l1 stack of a pixel. Supported data types: S32 - * @param[in] l1_stack_stride_x Stride of the l1 stack buffer in X dimension (in bytes) - * @param[in] l1_stack_step_x l1_stack_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] l1_stack_stride_y Stride of the l1 stack buffer in Y dimension (in bytes) - * @param[in] l1_stack_step_y l1_stack_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] l1_stack_offset_first_element_in_bytes The offset of the first element of the l1 stack buffer - * @param[out] l1_stack_counter_ptr Pointer to the l1 stack counters of an image. Supported data types: U8 - * @param[in] l1_stack_counter_stride_x Stride of the l1 stack counter buffer in X dimension (in bytes) - * @param[in] l1_stack_counter_step_x l1_stack_counter_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] l1_stack_counter_stride_y Stride of the l1 stack counter buffer in Y dimension (in bytes) - * @param[in] l1_stack_counter_step_y l1_stack_counter_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] l1_stack_counter_offset_first_element_in_bytes The offset of the first element of the l1 stack counter buffer - * @param[in] low_thr The lower threshold - * @param[in] up_thr The upper threshold - * @param[in] width The width of the image. 
- * @param[in] height The height of the image - */ -kernel void hysteresis( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(out), - IMAGE_DECLARATION(visited), - IMAGE_DECLARATION(recorded), - IMAGE_DECLARATION(l1_stack), - IMAGE_DECLARATION(l1_stack_counter), - uint low_thr, - uint up_thr, - int width, - int height) -{ - // Create images - Image src = CONVERT_TO_IMAGE_STRUCT_NO_STEP(src); - Image out = CONVERT_TO_IMAGE_STRUCT_NO_STEP(out); - Image visited = CONVERT_TO_IMAGE_STRUCT_NO_STEP(visited); - Image recorded = CONVERT_TO_IMAGE_STRUCT_NO_STEP(recorded); - Image l1_stack = CONVERT_TO_IMAGE_STRUCT_NO_STEP(l1_stack); - Image l1_stack_counter = CONVERT_TO_IMAGE_STRUCT_NO_STEP(l1_stack_counter); - - // Index of the pixel this work-item starts from - int x = get_global_id(0); - int y = get_global_id(1); - - // Load value - DATA_TYPE_IN val = *((__global DATA_TYPE_IN *)offset(&src, x, y)); - - // If the pixel has already been marked as NO_EDGE, store that value in the output and return - if(val == NO_EDGE) - { - *offset(&out, x, y) = NO_EDGE; - return; - } - - // Return if it is a MAYBE pixel. 
Such pixels will become edges if near a strong edge - if(val <= up_thr) - { - return; - } - - // Init local stack 2 - short2 stack_L2[hysteresis_local_stack_L2] = { 0 }; - int L2_counter = 0; - - // Perform hysteresis iteratively, using the two stacks in place of recursion - while(true) - { - // Get L1 stack pointer of the current pixel - __global short2 *l1_ptr = (__global short2 *)(l1_stack.ptr + y * l1_stack.stride_y + x * hysteresis_local_stack_L1 * l1_stack.stride_x); - - // If the pixel has already been visited, proceed with the items in the stack instead - if(atomic_or((__global uint *)offset(&visited, x, y), 1) != 0) - { - goto pop_stack; - } - - // Set strong edge - *offset(&out, x, y) = EDGE; - - // If the level 2 stack is full, skip the neighbour checks and pop from it instead - if(L2_counter > (hysteresis_local_stack_L2 - 1)) - { - goto pop_stack2; - } - - // Number of entries in the level 1 stack of the current pixel (assigned by check_pixel and at pop_stack) - char c; - - VEC_DATA_TYPE(DATA_TYPE_IN, 4) - x_tmp; - uint4 v_tmp; - - // Get direction pixel indices - int N = max(y - 1, 0), S = min(y + 1, height - 2), W = max(x - 1, 0), E = min(x + 1, width - 2); - - // Check 8 pixels around for weak edges where low_thr < val <= up_thr - x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, N)); - v_tmp = vload4(0, (__global uint *)offset(&visited, W, N)); - check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, N, x, y); // NW - check_pixel(((x_tmp.s1 <= low_thr) || v_tmp.s1 || (x_tmp.s1 > up_thr)), x, N, x, y); // N - check_pixel(((x_tmp.s2 <= low_thr) || v_tmp.s2 || (x_tmp.s2 > up_thr)), E, N, x, y); // NE - - x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, y)); - v_tmp = vload4(0, (__global uint *)offset(&visited, W, y)); - check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, y, x, y); // W - check_pixel(((x_tmp.s2 <= low_thr) || v_tmp.s2 || (x_tmp.s2 > up_thr)), E, y, x, y); // E - - x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, S)); - v_tmp = vload4(0, (__global uint *)offset(&visited, W, S)); - 
check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, S, x, y); // SW - check_pixel(((x_tmp.s1 <= low_thr) || v_tmp.s1 || (x_tmp.s1 > up_thr)), x, S, x, y); // S - check_pixel(((x_tmp.s2 <= low_thr) || v_tmp.s2 || (x_tmp.s2 > up_thr)), E, S, x, y); // SE - -#undef check_pixel - -pop_stack: - c = *((__global char *)offset(&l1_stack_counter, x, y)); - - if(c >= 1) - { - *((__global char *)offset(&l1_stack_counter, x, y)) -= 1; - int2 l_c = convert_int2(l1_ptr[c - 1]); - - // Push the current position into level 2 stack and move to the popped level 1 entry - stack_L2[L2_counter].x = x; - stack_L2[L2_counter].y = y; - - x = l_c.x; - y = l_c.y; - - L2_counter++; - - continue; - } - - if(L2_counter > 0) - { - goto pop_stack2; - } - else - { - return; - } - -pop_stack2: - L2_counter--; - x = stack_L2[L2_counter].x; - y = stack_L2[L2_counter].y; - }; -} diff --git a/src/core/CL/cl_kernels/channel_combine.cl b/src/core/CL/cl_kernels/channel_combine.cl deleted file mode 100644 index 550d52e9ea..0000000000 --- a/src/core/CL/cl_kernels/channel_combine.cl +++ /dev/null @@ -1,416 +0,0 @@ -/* - * Copyright (c) 2016-2018 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** This function combines three planes to a single RGB image. - * - * @param[in] plane0_ptr Pointer to the first plane. Supported Format: U8 - * @param[in] plane0_stride_x Stride of the first plane in X dimension (in bytes) - * @param[in] plane0_step_x plane0_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] plane0_stride_y Stride of the first plane in Y dimension (in bytes) - * @param[in] plane0_step_y plane0_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane - * @param[in] plane1_ptr Pointer to the second plane. Supported Format: U8 - * @param[in] plane1_stride_x Stride of the second plane in X dimension (in bytes) - * @param[in] plane1_step_x plane1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] plane1_stride_y Stride of the second plane in Y dimension (in bytes) - * @param[in] plane1_step_y plane1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane - * @param[in] plane2_ptr Pointer to the third plane. 
Supported Format: U8 - * @param[in] plane2_stride_x Stride of the third plane in X dimension (in bytes) - * @param[in] plane2_step_x plane2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] plane2_stride_y Stride of the third plane in Y dimension (in bytes) - * @param[in] plane2_step_y plane2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane - * @param[in] dst_ptr Pointer to the destination image. Supported Format: RGB - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void channel_combine_RGB888( - IMAGE_DECLARATION(plane0), - IMAGE_DECLARATION(plane1), - IMAGE_DECLARATION(plane2), - IMAGE_DECLARATION(dst)) -{ - // Get pixels pointer - Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0); - Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1); - Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - uchar16 data0 = vload16(0, plane0.ptr); - uchar16 data1 = vload16(0, plane1.ptr); - uchar16 data2 = vload16(0, plane2.ptr); - - uchar16 out0 = (uchar16)(data0.s0, data1.s0, data2.s0, - data0.s1, data1.s1, data2.s1, - data0.s2, data1.s2, data2.s2, - data0.s3, data1.s3, data2.s3, - data0.s4, data1.s4, data2.s4, - data0.s5); - vstore16(out0, 0, dst.ptr); - - uchar16 out1 = (uchar16)(data1.s5, data2.s5, data0.s6, - data1.s6, data2.s6, data0.s7, - data1.s7, data2.s7, data0.s8, - data1.s8, data2.s8, data0.s9, - data1.s9, data2.s9, data0.sA, - data1.sA); 
- vstore16(out1, 0, dst.ptr + 16); - - uchar16 out2 = (uchar16)(data2.sA, data0.sB, data1.sB, - data2.sB, data0.sC, data1.sC, - data2.sC, data0.sD, data1.sD, - data2.sD, data0.sE, data1.sE, - data2.sE, data0.sF, data1.sF, - data2.sF); - vstore16(out2, 0, dst.ptr + 32); -} - -/** This function combines three planes to a single RGBA image. - * - * @param[in] plane0_ptr Pointer to the first plane. Supported Format: U8 - * @param[in] plane0_stride_x Stride of the first plane in X dimension (in bytes) - * @param[in] plane0_step_x plane0_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] plane0_stride_y Stride of the first plane in Y dimension (in bytes) - * @param[in] plane0_step_y plane0_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane - * @param[in] plane1_ptr Pointer to the second plane. Supported Format: U8 - * @param[in] plane1_stride_x Stride of the second plane in X dimension (in bytes) - * @param[in] plane1_step_x plane1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] plane1_stride_y Stride of the second plane in Y dimension (in bytes) - * @param[in] plane1_step_y plane1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane - * @param[in] plane2_ptr Pointer to the third plane. 
Supported Format: U8 - * @param[in] plane2_stride_x Stride of the third plane in X dimension (in bytes) - * @param[in] plane2_step_x plane2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] plane2_stride_y Stride of the third plane in Y dimension (in bytes) - * @param[in] plane2_step_y plane2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane - * @param[in] plane3_ptr Pointer to the fourth plane. Supported Format: U8 - * @param[in] plane3_stride_x Stride of the fourth plane in X dimension (in bytes) - * @param[in] plane3_step_x plane3_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] plane3_stride_y Stride of the fourth plane in Y dimension (in bytes) - * @param[in] plane3_step_y plane3_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] plane3_offset_first_element_in_bytes The offset of the first element in the fourth plane - * @param[in] dst_ptr Pointer to the destination image. 
Supported Format: RGBA - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void channel_combine_RGBA8888( - IMAGE_DECLARATION(plane0), - IMAGE_DECLARATION(plane1), - IMAGE_DECLARATION(plane2), - IMAGE_DECLARATION(plane3), - IMAGE_DECLARATION(dst)) -{ - // Get pixels pointer - Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0); - Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1); - Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2); - Image plane3 = CONVERT_TO_IMAGE_STRUCT(plane3); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - uchar16 data0 = vload16(0, plane0.ptr); - uchar16 data1 = vload16(0, plane1.ptr); - uchar16 data2 = vload16(0, plane2.ptr); - uchar16 data3 = vload16(0, plane3.ptr); - - uchar16 out0 = (uchar16)(data0.s0, data1.s0, data2.s0, data3.s0, - data0.s1, data1.s1, data2.s1, data3.s1, - data0.s2, data1.s2, data2.s2, data3.s2, - data0.s3, data1.s3, data2.s3, data3.s3); - vstore16(out0, 0, dst.ptr); - - uchar16 out1 = (uchar16)(data0.s4, data1.s4, data2.s4, data3.s4, - data0.s5, data1.s5, data2.s5, data3.s5, - data0.s6, data1.s6, data2.s6, data3.s6, - data0.s7, data1.s7, data2.s7, data3.s7); - vstore16(out1, 0, dst.ptr + 16); - - uchar16 out2 = (uchar16)(data0.s8, data1.s8, data2.s8, data3.s8, - data0.s9, data1.s9, data2.s9, data3.s9, - data0.sA, data1.sA, data2.sA, data3.sA, - data0.sB, data1.sB, data2.sB, data3.sB); - vstore16(out2, 0, dst.ptr + 32); - - uchar16 out3 = (uchar16)(data0.sC, data1.sC, data2.sC, data3.sC, - data0.sD, data1.sD, data2.sD, data3.sD, - data0.sE, data1.sE, data2.sE, data3.sE, - data0.sF, data1.sF, 
data2.sF, data3.sF); - vstore16(out3, 0, dst.ptr + 48); -} - -/** This function combines three planes to a single YUYV image. - * - * @param[in] plane0_ptr Pointer to the first plane. Supported Format: U8 - * @param[in] plane0_stride_x Stride of the first plane in X dimension (in bytes) - * @param[in] plane0_step_x plane0_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] plane0_stride_y Stride of the first plane in Y dimension (in bytes) - * @param[in] plane0_step_y plane0_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane - * @param[in] plane1_ptr Pointer to the second plane. Supported Format: U8 - * @param[in] plane1_stride_x Stride of the second plane in X dimension (in bytes) - * @param[in] plane1_step_x plane1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] plane1_stride_y Stride of the second plane in Y dimension (in bytes) - * @param[in] plane1_step_y plane1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane - * @param[in] plane2_ptr Pointer to the third plane. Supported Format: U8 - * @param[in] plane2_stride_x Stride of the third plane in X dimension (in bytes) - * @param[in] plane2_step_x plane2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] plane2_stride_y Stride of the third plane in Y dimension (in bytes) - * @param[in] plane2_step_y plane2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane - * @param[in] dst_ptr Pointer to the destination image. 
Supported Format: YUYV - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void channel_combine_YUYV422( - IMAGE_DECLARATION(plane0), - IMAGE_DECLARATION(plane1), - IMAGE_DECLARATION(plane2), - IMAGE_DECLARATION(dst)) -{ - // Get pixels pointer - Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0); - Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1); - Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - uchar16 data0 = vload16(0, plane0.ptr); - uchar8 data1 = vload8(0, plane1.ptr); - uchar8 data2 = vload8(0, plane2.ptr); - - uchar16 out0 = (uchar16)(data0.s0, data1.s0, data0.s1, data2.s0, - data0.s2, data1.s1, data0.s3, data2.s1, - data0.s4, data1.s2, data0.s5, data2.s2, - data0.s6, data1.s3, data0.s7, data2.s3); - vstore16(out0, 0, dst.ptr); - uchar16 out1 = (uchar16)(data0.s8, data1.s4, data0.s9, data2.s4, - data0.sA, data1.s5, data0.sB, data2.s5, - data0.sC, data1.s6, data0.sD, data2.s6, - data0.sE, data1.s7, data0.sF, data2.s7); - vstore16(out1, 0, dst.ptr + 16); -} - -/** This function combines three planes to a single UYUV image. - * - * @param[in] plane0_ptr Pointer to the first plane. 
Supported Format: U8 - * @param[in] plane0_stride_x Stride of the first plane in X dimension (in bytes) - * @param[in] plane0_step_x plane0_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] plane0_stride_y Stride of the first plane in Y dimension (in bytes) - * @param[in] plane0_step_y plane0_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane - * @param[in] plane1_ptr Pointer to the second plane. Supported Format: U8 - * @param[in] plane1_stride_x Stride of the second plane in X dimension (in bytes) - * @param[in] plane1_step_x plane1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] plane1_stride_y Stride of the second plane in Y dimension (in bytes) - * @param[in] plane1_step_y plane1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane - * @param[in] plane2_ptr Pointer to the third plane. Supported Format: U8 - * @param[in] plane2_stride_x Stride of the third plane in X dimension (in bytes) - * @param[in] plane2_step_x plane2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] plane2_stride_y Stride of the third plane in Y dimension (in bytes) - * @param[in] plane2_step_y plane2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane - * @param[in] dst_ptr Pointer to the destination image. 
Supported Format: UYUV - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void channel_combine_UYVY422( - IMAGE_DECLARATION(plane0), - IMAGE_DECLARATION(plane1), - IMAGE_DECLARATION(plane2), - IMAGE_DECLARATION(dst)) -{ - // Get pixels pointer - Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0); - Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1); - Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - uchar16 data0 = vload16(0, plane0.ptr); - uchar8 data1 = vload8(0, plane1.ptr); - uchar8 data2 = vload8(0, plane2.ptr); - - uchar16 out0 = (uchar16)(data1.s0, data0.s0, data2.s0, data0.s1, - data1.s1, data0.s2, data2.s1, data0.s3, - data1.s2, data0.s4, data2.s2, data0.s5, - data1.s3, data0.s6, data2.s3, data0.s7); - vstore16(out0, 0, dst.ptr); - uchar16 out1 = (uchar16)(data1.s4, data0.s8, data2.s4, data0.s9, - data1.s5, data0.sA, data2.s5, data0.sB, - data1.s6, data0.sC, data2.s6, data0.sD, - data1.s7, data0.sE, data2.s7, data0.sF); - vstore16(out1, 0, dst.ptr + 16); -} - -/** This function combines three planes to a single NV12/NV21 image. - * - * @note NV12 or NV21 has to be specified through preprocessor macro. eg. -DNV12 performs NV12 channel combine. - * - * @param[in] src_plane0_ptr Pointer to the first plane. 
Supported Format: U8 - * @param[in] src_plane0_stride_x Stride of the first plane in X dimension (in bytes) - * @param[in] src_plane0_step_x src_plane0_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_plane0_stride_y Stride of the first plane in Y dimension (in bytes) - * @param[in] src_plane0_step_y src_plane0_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_plane0_offset_first_element_in_bytes The offset of the first element in the first plane - * @param[in] src_plane1_ptr Pointer to the second plane. Supported Format: U8 - * @param[in] src_plane1_stride_x Stride of the second plane in X dimension (in bytes) - * @param[in] src_plane1_step_x src_plane1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_plane1_stride_y Stride of the second plane in Y dimension (in bytes) - * @param[in] src_plane1_step_y src_plane1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_plane1_offset_first_element_in_bytes The offset of the first element in the second plane - * @param[in] src_plane2_ptr Pointer to the third plane. Supported Format: U8 - * @param[in] src_plane2_stride_x Stride of the third plane in X dimension (in bytes) - * @param[in] src_plane2_step_x src_plane2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_plane2_stride_y Stride of the third plane in Y dimension (in bytes) - * @param[in] src_plane2_step_y src_plane2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_plane2_offset_first_element_in_bytes The offset of the first element in the third plane - * @param[in] dst_plane0_ptr Pointer to the first plane of the destination image. 
Supported Format: U8 - * @param[in] dst_plane0_stride_x Stride of the first plane of the destination image in X dimension (in bytes) - * @param[in] dst_plane0_step_x dst_plane0_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_plane0_stride_y Stride of the first plane of the destination image in Y dimension (in bytes) - * @param[in] dst_plane0_step_y dst_plane0_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_plane0_offset_first_element_in_bytes The offset of the first element in the first plane of the destination image - * @param[in] dst_plane1_ptr Pointer to the second plane of the destination image. Supported Format: UV88 - * @param[in] dst_plane1_stride_x Stride of the second plane of the destination image in X dimension (in bytes) - * @param[in] dst_plane1_step_x dst_plane1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_plane1_stride_y Stride of the second plane of the destination image in Y dimension (in bytes) - * @param[in] dst_plane1_step_y dst_plane1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_plane1_offset_first_element_in_bytes The offset of the first element in the second plane of the destination image - * @param[in] height Sub-sampled height - */ -__kernel void channel_combine_NV( - IMAGE_DECLARATION(src_plane0), - IMAGE_DECLARATION(src_plane1), - IMAGE_DECLARATION(src_plane2), - IMAGE_DECLARATION(dst_plane0), - IMAGE_DECLARATION(dst_plane1), - uint height) -{ - // Get pixels pointer - Image src_plane0 = CONVERT_TO_IMAGE_STRUCT(src_plane0); - Image src_plane1 = CONVERT_TO_IMAGE_STRUCT(src_plane1); - Image src_plane2 = CONVERT_TO_IMAGE_STRUCT(src_plane2); - Image dst_plane0 = CONVERT_TO_IMAGE_STRUCT(dst_plane0); - Image dst_plane1 = CONVERT_TO_IMAGE_STRUCT(dst_plane1); - - // Copy plane data - vstore16(vload16(0, src_plane0.ptr), 0, dst_plane0.ptr); - vstore16(vload16(0, 
offset(&src_plane0, 0, height)), 0, (__global uchar *)offset(&dst_plane0, 0, height)); - - // Create UV place - uchar8 data1 = vload8(0, src_plane1.ptr); - uchar8 data2 = vload8(0, src_plane2.ptr); - -#ifdef NV12 - vstore16(shuffle2(data1, data2, (uchar16)(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)), 0, dst_plane1.ptr); -#elif defined(NV21) - vstore16(shuffle2(data2, data1, (uchar16)(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)), 0, dst_plane1.ptr); -#endif /* NV12 or NV21 */ -} - -/** This function combines three planes to a single YUV444 or IYUV image. - * - * @note YUV444 or IYUV has to be specified through preprocessor macro. eg. -DIYUV performs IYUV channel combine. - * - * @param[in] src_plane0_ptr Pointer to the first plane. Supported Format: U8 - * @param[in] src_plane0_stride_x Stride of the first plane in X dimension (in bytes) - * @param[in] src_plane0_step_x src_plane0_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_plane0_stride_y Stride of the first plane in Y dimension (in bytes) - * @param[in] src_plane0_step_y src_plane0_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_plane0_offset_first_element_in_bytes The offset of the first element in the first plane - * @param[in] src_plane1_ptr Pointer to the second plane. Supported Format: U8 - * @param[in] src_plane1_stride_x Stride of the second plane in X dimension (in bytes) - * @param[in] src_plane1_step_x src_plane1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_plane1_stride_y Stride of the second plane in Y dimension (in bytes) - * @param[in] src_plane1_step_y src_plane1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_plane1_offset_first_element_in_bytes The offset of the first element in the second plane - * @param[in] src_plane2_ptr Pointer to the third plane. 
Supported Format: U8 - * @param[in] src_plane2_stride_x Stride of the third plane in X dimension (in bytes) - * @param[in] src_plane2_step_x src_plane2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_plane2_stride_y Stride of the third plane in Y dimension (in bytes) - * @param[in] src_plane2_step_y src_plane2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_plane2_offset_first_element_in_bytes The offset of the first element in the third plane - * @param[in] dst_plane0_ptr Pointer to the first plane of the destination image. Supported Format: U8 - * @param[in] dst_plane0_stride_x Stride of the first plane of the destination image in X dimension (in bytes) - * @param[in] dst_plane0_step_x dst_plane0_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_plane0_stride_y Stride of the first plane of the destination image in Y dimension (in bytes) - * @param[in] dst_plane0_step_y dst_plane0_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_plane0_offset_first_element_in_bytes The offset of the first element in the first plane of the destination image - * @param[in] dst_plane1_ptr Pointer to the second plane of the destination image. Supported Format: U8 - * @param[in] dst_plane1_stride_x Stride of the second plane of the destination image in X dimension (in bytes) - * @param[in] dst_plane1_step_x dst_plane1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_plane1_stride_y Stride of the second plane of the destination image in Y dimension (in bytes) - * @param[in] dst_plane1_step_y dst_plane1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_plane1_offset_first_element_in_bytes The offset of the first element in the second plane of the destination image - * @param[in] dst_plane2_ptr Pointer to the third plane of the destination image. 
Supported Format: U8 - * @param[in] dst_plane2_stride_x Stride of the third plane of the destination image in X dimension (in bytes) - * @param[in] dst_plane2_step_x dst_plane2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_plane2_stride_y Stride of the third plane of the destination image in Y dimension (in bytes) - * @param[in] dst_plane2_step_y dst_plane2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_plane2_offset_first_element_in_bytes The offset of the first element in the third plane of the destination image - * @param[in] height Sub-sampled height - */ -__kernel void copy_planes_3p( - IMAGE_DECLARATION(src_plane0), - IMAGE_DECLARATION(src_plane1), - IMAGE_DECLARATION(src_plane2), - IMAGE_DECLARATION(dst_plane0), - IMAGE_DECLARATION(dst_plane1), - IMAGE_DECLARATION(dst_plane2), - uint height) -{ - // Get pixels pointer - Image src_plane0 = CONVERT_TO_IMAGE_STRUCT(src_plane0); - Image src_plane1 = CONVERT_TO_IMAGE_STRUCT(src_plane1); - Image src_plane2 = CONVERT_TO_IMAGE_STRUCT(src_plane2); - Image dst_plane0 = CONVERT_TO_IMAGE_STRUCT(dst_plane0); - Image dst_plane1 = CONVERT_TO_IMAGE_STRUCT(dst_plane1); - Image dst_plane2 = CONVERT_TO_IMAGE_STRUCT(dst_plane2); - - // Copy plane data - vstore16(vload16(0, src_plane0.ptr), 0, dst_plane0.ptr); -#ifdef YUV444 - vstore16(vload16(0, src_plane1.ptr), 0, dst_plane1.ptr); - vstore16(vload16(0, src_plane2.ptr), 0, dst_plane2.ptr); -#elif defined(IYUV) - vstore16(vload16(0, offset(&src_plane0, 0, height)), 0, (__global uchar *)offset(&dst_plane0, 0, height)); - vstore8(vload8(0, src_plane1.ptr), 0, dst_plane1.ptr); - vstore8(vload8(0, src_plane2.ptr), 0, dst_plane2.ptr); -#endif /* YUV444 or IYUV */ -} diff --git a/src/core/CL/cl_kernels/channel_extract.cl b/src/core/CL/cl_kernels/channel_extract.cl deleted file mode 100644 index b64f24814e..0000000000 --- a/src/core/CL/cl_kernels/channel_extract.cl +++ /dev/null @@ -1,272 +0,0 @@ -/* 
- * Copyright (c) 2016-2018 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** This function extracts a given channel from an RGB image. - * - * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_B will extract the B channel. - * - * @param[in] src_ptr Pointer to the source image. Supported Format: RGB - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. 
Supported Format: U8 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void channel_extract_RGB888( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - // Get pixels pointer - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - uchar16 data = vload16(0, src.ptr); - uchar8 data2 = vload8(0, src.ptr + 16); - -#ifdef CHANNEL_R - vstore4(data.s0369, 0, dst.ptr); - vstore4((uchar4)(data.sCF, data2.s25), 0, dst.ptr + 4); -#elif defined(CHANNEL_G) - vstore4(data.s147A, 0, dst.ptr); - vstore4((uchar4)(data.sD, data2.s036), 0, dst.ptr + 4); -#elif defined(CHANNEL_B) - vstore4(data.s258B, 0, dst.ptr); - vstore4((uchar4)(data.sE, data2.s147), 0, dst.ptr + 4); -#endif /* CHANNEL_R or CHANNEL_G or CHANNEL_B */ -} - -/** This function extracts a given channel from an RGBA image. - * - * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_B will extract the B channel. - * - * @param[in] src_ptr Pointer to the source image. Supported Format: RGBA - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. 
Supported Format: U8 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void channel_extract_RGBA8888( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - // Get pixels pointer - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - uchar16 data = vload16(0, src.ptr); - uchar16 data2 = vload16(0, src.ptr + 16); - -#ifdef CHANNEL_R - vstore8((uchar8)(data.s048C, data2.s048C), 0, dst.ptr); -#elif defined(CHANNEL_G) - vstore8((uchar8)(data.s159D, data2.s159D), 0, dst.ptr); -#elif defined(CHANNEL_B) - vstore8((uchar8)(data.s26AE, data2.s26AE), 0, dst.ptr); -#elif defined(CHANNEL_A) - vstore8((uchar8)(data.s37BF, data2.s37BF), 0, dst.ptr); -#endif /* CHANNEL_R or CHANNEL_G or CHANNEL_B or CHANNEL_A */ -} - -/** This function extracts a given channel from an YUYV image. - * - * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel. - * - * @param[in] src_ptr Pointer to the source image. Supported Format: YUYV - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. 
Supported Format: U8 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void channel_extract_YUYV422( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - // Get pixels pointer - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - uchar16 data = vload16(0, src.ptr); - -#ifdef CHANNEL_Y - vstore8(data.s02468ACE, 0, dst.ptr); -#elif defined(CHANNEL_U) - vstore4(data.s159D, 0, dst.ptr); -#elif defined(CHANNEL_V) - vstore4(data.s37BF, 0, dst.ptr); -#endif /* CHANNEL_Y or CHANNEL_U or CHANNEL_V */ -} - -/** This function extracts a given channel from an UYUV image. - * - * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel. - * - * @param[in] src_ptr Pointer to the source image. Supported Format: UYUV - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. 
Supported Format: U8
 * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void channel_extract_UYVY422(
    IMAGE_DECLARATION(src),
    IMAGE_DECLARATION(dst))
{
    // Resolve the source and destination positions of this work-item
    Image uyvy_img  = CONVERT_TO_IMAGE_STRUCT(src);
    Image plane_img = CONVERT_TO_IMAGE_STRUCT(dst);

    // 16 interleaved bytes: Y at odd offsets, U at 0, 4, 8, 12 and V at 2, 6, 10, 14
    const uchar16 uyvy = vload16(0, uyvy_img.ptr);

    // Y yields 8 output bytes, U and V yield 4 each.
    // If none of CHANNEL_Y / CHANNEL_U / CHANNEL_V is defined, nothing is stored.
#if defined(CHANNEL_Y)
    vstore8(uyvy.odd, 0, plane_img.ptr);
#elif defined(CHANNEL_U)
    vstore4(uyvy.even.even, 0, plane_img.ptr);
#elif defined(CHANNEL_V)
    vstore4(uyvy.even.odd, 0, plane_img.ptr);
#endif /* CHANNEL_Y or CHANNEL_U or CHANNEL_V */
}

/** This function extracts a given channel from an NV12 image.
 *
 * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel.
 * @warning Only channels UV can be extracted using this kernel.
 *
 * @param[in]  src_ptr                           Pointer to the source image. Supported Format: NV12 (UV88)
 * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] dst_ptr                           Pointer to the destination image.
Supported Format: U8
 * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void channel_extract_NV12(
    IMAGE_DECLARATION(src),
    IMAGE_DECLARATION(dst))
{
    // Resolve the source and destination positions of this work-item
    Image uv_img    = CONVERT_TO_IMAGE_STRUCT(src);
    Image plane_img = CONVERT_TO_IMAGE_STRUCT(dst);

    // 16 interleaved chroma bytes: U at even offsets, V at odd offsets
    const uchar16 uv = vload16(0, uv_img.ptr);

    // If neither CHANNEL_U nor CHANNEL_V is defined, nothing is stored.
#if defined(CHANNEL_U)
    vstore8(uv.even, 0, plane_img.ptr);
#elif defined(CHANNEL_V)
    vstore8(uv.odd, 0, plane_img.ptr);
#endif /* CHANNEL_U or CHANNEL_V */
}

/** This function extracts a given channel from an NV21 image.
 *
 * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel.
 * @warning Only channels UV can be extracted using this kernel.
 *
 * @param[in]  src_ptr                           Pointer to the source image. Supported Format: NV21 (UV88)
 * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] dst_ptr                           Pointer to the destination image.
Supported Format: U8
 * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void channel_extract_NV21(
    IMAGE_DECLARATION(src),
    IMAGE_DECLARATION(dst))
{
    // Resolve the source and destination positions of this work-item
    Image vu_img    = CONVERT_TO_IMAGE_STRUCT(src);
    Image plane_img = CONVERT_TO_IMAGE_STRUCT(dst);

    // 16 interleaved chroma bytes: V at even offsets, U at odd offsets
    const uchar16 vu = vload16(0, vu_img.ptr);

    // If neither CHANNEL_U nor CHANNEL_V is defined, nothing is stored.
#if defined(CHANNEL_U)
    vstore8(vu.odd, 0, plane_img.ptr);
#elif defined(CHANNEL_V)
    vstore8(vu.even, 0, plane_img.ptr);
#endif /* CHANNEL_U or CHANNEL_V */
}

/** This function extracts a given plane from a multi-planar image.
 *
 * @param[in]  src_ptr                           Pointer to the source image. Supported Format: U8
 * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] dst_ptr                           Pointer to the destination image.
Supported Format: U8
 * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void copy_plane(
    IMAGE_DECLARATION(src),
    IMAGE_DECLARATION(dst))
{
    // Resolve the source and destination positions of this work-item
    Image plane_in  = CONVERT_TO_IMAGE_STRUCT(src);
    Image plane_out = CONVERT_TO_IMAGE_STRUCT(dst);

    // Move 8 bytes of plane data unchanged from source to destination
    const uchar8 chunk = vload8(0, plane_in.ptr);
    vstore8(chunk, 0, plane_out.ptr);
}
diff --git a/src/core/CL/cl_kernels/color_convert.cl b/src/core/CL/cl_kernels/color_convert.cl deleted file mode 100644 index cbebc88668..0000000000 --- a/src/core/CL/cl_kernels/color_convert.cl +++ /dev/null @@ -1,1911 +0,0 @@ -/* - * Copyright (c) 2016-2018 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** Convert an RGB888 image to RGBA8888 - * - * Global Workgroup Size [ DIV_CEIL(width, 16), height ] - * No offset. - * - * @param[in] input_ptr Pointer to the source image. Supported Format: RGB888 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. 
Supported Format: RGBA8888
 * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void RGB888_to_RGBA8888_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(output))
{
    Image src = CONVERT_TO_IMAGE_STRUCT(input);
    Image dst = CONVERT_TO_IMAGE_STRUCT(output);

    // 16 packed three-byte pixels = 48 bytes, read as three 16-byte vectors
    const uchar16 a = vload16(0, src.ptr);
    const uchar16 b = vload16(1, src.ptr);
    const uchar16 c = vload16(2, src.ptr);

    // Value written to the fourth channel of every output pixel
    const uchar opaque = 255;

    // Re-pack as four groups of four RGBA pixels. Pixels 5 and 10 straddle two input vectors.
    const uchar16 quad0 = (uchar16)((uchar4)(a.s012, opaque), (uchar4)(a.s345, opaque),
                                    (uchar4)(a.s678, opaque), (uchar4)(a.s9ab, opaque));
    const uchar16 quad1 = (uchar16)((uchar4)(a.scde, opaque), (uchar4)(a.sf, b.s01, opaque),
                                    (uchar4)(b.s234, opaque), (uchar4)(b.s567, opaque));
    const uchar16 quad2 = (uchar16)((uchar4)(b.s89a, opaque), (uchar4)(b.sbcd, opaque),
                                    (uchar4)(b.sef, c.s0, opaque), (uchar4)(c.s123, opaque));
    const uchar16 quad3 = (uchar16)((uchar4)(c.s456, opaque), (uchar4)(c.s789, opaque),
                                    (uchar4)(c.sabc, opaque), (uchar4)(c.sdef, opaque));

    // 64 output bytes, written as four consecutive 16-byte vectors
    vstore16(quad0, 0, dst.ptr);
    vstore16(quad1, 1, dst.ptr);
    vstore16(quad2, 2, dst.ptr);
    vstore16(quad3, 3, dst.ptr);
}

/** Convert an RGB888 image to U8
 *
 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
 * No offset.
 *
 * @param[in]  input_ptr                            Pointer to the source image.
Supported Format: RGB888
 * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] output_ptr                           Pointer to the destination image. Supported Format: U8
 * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void RGB888_to_U8_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(output))
{
    Image src = CONVERT_TO_IMAGE_STRUCT(input);
    Image dst = CONVERT_TO_IMAGE_STRUCT(output);

    // 16 packed three-byte pixels = 48 bytes, read as three 16-byte vectors
    const uchar16 v0 = vload16(0, src.ptr);
    const uchar16 v1 = vload16(1, src.ptr);
    const uchar16 v2 = vload16(2, src.ptr);

    // De-interleave into one vector per colour component (every third byte, offsets 0 / 1 / 2)
    const uchar16 reds   = (uchar16)(v0.s0369, v0.scf, v1.s258b, v1.se, v2.s147a, v2.sd);
    const uchar16 greens = (uchar16)(v0.s147a, v0.sd, v1.s0369, v1.scf, v2.s258b, v2.se);
    const uchar16 blues  = (uchar16)(v0.s258b, v0.se, v1.s147a, v1.sd, v2.s0369, v2.scf);

    // Per-component greyscale weights
    const float16 weight_r = (float16)(0.2126f);
    const float16 weight_g = (float16)(0.7152f);
    const float16 weight_b = (float16)(0.0722f);

    // Weighted sum in float, evaluated left to right
    const float16 grey_f = weight_r * convert_float16(reds) + weight_g * convert_float16(greens) + weight_b * convert_float16(blues);

    // Saturate to [0, 255] with round-toward-zero and write the 16 results
    vstore16(convert_uchar16_sat_rtz(grey_f), 0, dst.ptr);
}

/** Convert an RGBA8888 image to RGB888
 *
 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
 * No offset.
 *
 * @param[in]  input_ptr                            Pointer to the source image. Supported Format: RGBA8888
 * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] output_ptr                           Pointer to the destination image.
Supported Format: RGB888
 * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void RGBA8888_to_RGB888_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(output))
{
    Image src = CONVERT_TO_IMAGE_STRUCT(input);
    Image dst = CONVERT_TO_IMAGE_STRUCT(output);

    // 16 four-byte pixels = 64 bytes, read as four 16-byte vectors
    const uchar16 q0 = vload16(0, src.ptr);
    const uchar16 q1 = vload16(1, src.ptr);
    const uchar16 q2 = vload16(2, src.ptr);
    const uchar16 q3 = vload16(3, src.ptr);

    // Drop every fourth byte (offsets 3, 7, 11, 15 of each vector) and pack the remaining
    // 48 bytes into three vectors. Pixels that straddle an output boundary are split.
    const uchar16 out0 = (uchar16)(q0.s012, q0.s456, q0.s89a, q0.scde, q1.s012, q1.s4);
    const uchar16 out1 = (uchar16)(q1.s56, q1.s89a, q1.scde, q2.s012, q2.s456, q2.s89);
    const uchar16 out2 = (uchar16)(q2.sa, q2.scde, q3.s012, q3.s456, q3.s89a, q3.scde);

    vstore16(out0, 0, dst.ptr);
    vstore16(out1, 1, dst.ptr);
    vstore16(out2, 2, dst.ptr);
}

/** Convert a UYVY422 image to RGB888 using BT709 color space
 *
 * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
 * No offset.
 *
 * @param[in]  input_ptr                            Pointer to the source image.
Supported Format: UYVY422
 * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] output_ptr                           Pointer to the destination image. Supported Format: RGB888
 * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void UYVY422_to_RGB888_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(output))
{
    Image src = CONVERT_TO_IMAGE_STRUCT(input);
    Image dst = CONVERT_TO_IMAGE_STRUCT(output);

    // One work-item converts 8 pixels = 16 source bytes
    const uchar16 px = vload16(0, src.ptr);

    // Luma sits at the odd byte offsets
    const uchar8 y_u8 = px.odd;

    // Cb is read from offsets 0, 4, 8, 12 and Cr from 2, 6, 10, 14; each sample is shared by two
    // neighbouring pixels. Subtracting 128 re-centres chroma around zero, using 8-bit signed vectors.
    const char8 cb = (char8)(px.s0, px.s0, px.s4, px.s4, px.s8, px.s8, px.sc, px.sc) - (char8)(128);
    const char8 cr = (char8)(px.s2, px.s2, px.s6, px.s6, px.sa, px.sa, px.se, px.se) - (char8)(128);

    const float8 y_f  = convert_float8(y_u8);
    const float8 cb_f = convert_float8(cb);
    const float8 cr_f = convert_float8(cr);

    // Chroma contribution to each colour component
    float8 red   = (float8)(1.5748f) * cr_f;
    float8 green = (float8)(-0.1873f) * cb_f + (float8)(-0.4681f) * cr_f;
    float8 blue  = (float8)(1.8556f) * cb_f;

    // Add luma
    red += y_f;
    green += y_f;
    blue += y_f;

    // Saturate to [0, 255] with round-toward-zero
    const uchar8 r = convert_uchar8_sat_rtz(red);
    const uchar8 g = convert_uchar8_sat_rtz(green);
    const uchar8 b = convert_uchar8_sat_rtz(blue);

    // Interleave into 24 output bytes: pixels 0-4 plus the red of pixel 5, then the remainder
    const uchar16 out_head = (uchar16)((uchar3)(r.s0, g.s0, b.s0), (uchar3)(r.s1, g.s1, b.s1),
                                       (uchar3)(r.s2, g.s2, b.s2), (uchar3)(r.s3, g.s3, b.s3),
                                       (uchar3)(r.s4, g.s4, b.s4), r.s5);
    const uchar8 out_tail = (uchar8)(g.s5, b.s5, (uchar3)(r.s6, g.s6, b.s6), (uchar3)(r.s7, g.s7, b.s7));

    vstore16(out_head, 0, dst.ptr);
    vstore8(out_tail, 2, dst.ptr); // byte offset 16
}

/** Convert a UYVY422 image to RGBA8888 using BT709 color space
 *
 * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
 * No offset.
 *
 * @param[in]  input_ptr                            Pointer to the source image. Supported Format: UYVY422
 * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] output_ptr                           Pointer to the destination image.
Supported Format: RGBA8888
 * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void UYVY422_to_RGBA8888_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(output))
{
    Image src = CONVERT_TO_IMAGE_STRUCT(input);
    Image dst = CONVERT_TO_IMAGE_STRUCT(output);

    // One work-item converts 8 pixels = 16 source bytes
    const uchar16 px = vload16(0, src.ptr);

    // Luma sits at the odd byte offsets
    const uchar8 y_u8 = px.odd;

    // Cb is read from offsets 0, 4, 8, 12 and Cr from 2, 6, 10, 14; each sample is shared by two
    // neighbouring pixels. Subtracting 128 re-centres chroma around zero, using 8-bit signed vectors.
    const char8 cb = (char8)(px.s0, px.s0, px.s4, px.s4, px.s8, px.s8, px.sc, px.sc) - (char8)(128);
    const char8 cr = (char8)(px.s2, px.s2, px.s6, px.s6, px.sa, px.sa, px.se, px.se) - (char8)(128);

    const float8 y_f  = convert_float8(y_u8);
    const float8 cb_f = convert_float8(cb);
    const float8 cr_f = convert_float8(cr);

    // Chroma contribution to each colour component
    float8 red   = (float8)(1.5748f) * cr_f;
    float8 green = (float8)(-0.1873f) * cb_f + (float8)(-0.4681f) * cr_f;
    float8 blue  = (float8)(1.8556f) * cb_f;

    // Add luma
    red += y_f;
    green += y_f;
    blue += y_f;

    // Saturate to [0, 255] with round-toward-zero
    const uchar8 r = convert_uchar8_sat_rtz(red);
    const uchar8 g = convert_uchar8_sat_rtz(green);
    const uchar8 b = convert_uchar8_sat_rtz(blue);

    // Value written to the fourth channel of every output pixel
    const uchar opaque = 255;

    // Interleave into 32 output bytes: pixels 0-3, then pixels 4-7
    const uchar16 out_lo = (uchar16)((uchar4)(r.s0, g.s0, b.s0, opaque), (uchar4)(r.s1, g.s1, b.s1, opaque),
                                     (uchar4)(r.s2, g.s2, b.s2, opaque), (uchar4)(r.s3, g.s3, b.s3, opaque));
    const uchar16 out_hi = (uchar16)((uchar4)(r.s4, g.s4, b.s4, opaque), (uchar4)(r.s5, g.s5, b.s5, opaque),
                                     (uchar4)(r.s6, g.s6, b.s6, opaque), (uchar4)(r.s7, g.s7, b.s7, opaque));

    vstore16(out_lo, 0, dst.ptr);
    vstore16(out_hi, 1, dst.ptr);
}

/** Convert a YUYV422 image to RGB888 using BT709 color space
 *
 * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
 * No offset.
 *
 * @param[in]  input_ptr                            Pointer to the source image. Supported Format: YUYV422
 * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] output_ptr                           Pointer to the destination image. Supported Format: RGB888
 * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void YUYV422_to_RGB888_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(output))
{
    Image src = CONVERT_TO_IMAGE_STRUCT(input);
    Image dst = CONVERT_TO_IMAGE_STRUCT(output);

    // One work-item converts 8 pixels = 16 source bytes
    const uchar16 px = vload16(0, src.ptr);

    // Luma sits at the even byte offsets
    const uchar8 y_u8 = px.even;

    // Cb is read from offsets 1, 5, 9, 13 and Cr from 3, 7, 11, 15; each sample is shared by two
    // neighbouring pixels. Subtracting 128 re-centres chroma around zero, using 8-bit signed vectors.
    const char8 cb = (char8)(px.s1, px.s1, px.s5, px.s5, px.s9, px.s9, px.sd, px.sd) - (char8)(128);
    const char8 cr = (char8)(px.s3, px.s3, px.s7, px.s7, px.sb, px.sb, px.sf, px.sf) - (char8)(128);

    const float8 y_f  = convert_float8(y_u8);
    const float8 cb_f = convert_float8(cb);
    const float8 cr_f = convert_float8(cr);

    // Chroma contribution to each colour component
    float8 red   = (float8)(1.5748f) * cr_f;
    float8 green = (float8)(-0.1873f) * cb_f + (float8)(-0.4681f) * cr_f;
    float8 blue  = (float8)(1.8556f) * cb_f;

    // Add luma
    red += y_f;
    green += y_f;
    blue += y_f;

    // Saturate to [0, 255] with round-toward-zero
    const uchar8 r = convert_uchar8_sat_rtz(red);
    const uchar8 g = convert_uchar8_sat_rtz(green);
    const uchar8 b = convert_uchar8_sat_rtz(blue);

    // Interleave into 24 output bytes: pixels 0-4 plus the red of pixel 5, then the remainder
    const uchar16 out_head = (uchar16)((uchar3)(r.s0, g.s0, b.s0), (uchar3)(r.s1, g.s1, b.s1),
                                       (uchar3)(r.s2, g.s2, b.s2), (uchar3)(r.s3, g.s3, b.s3),
                                       (uchar3)(r.s4, g.s4, b.s4), r.s5);
    const uchar8 out_tail = (uchar8)(g.s5, b.s5, (uchar3)(r.s6, g.s6, b.s6), (uchar3)(r.s7, g.s7, b.s7));

    vstore16(out_head, 0, dst.ptr);
    vstore8(out_tail, 2, dst.ptr); // byte offset 16
}

/** Convert a YUYV422 image to RGBA8888 using BT709 color space
 *
 * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
 * No offset.
 *
 * @param[in]  input_ptr                            Pointer to the source image. Supported Format: YUYV422
 * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] output_ptr                           Pointer to the destination image.
Supported Format: RGBA8888
 * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void YUYV422_to_RGBA8888_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(output))
{
    Image src = CONVERT_TO_IMAGE_STRUCT(input);
    Image dst = CONVERT_TO_IMAGE_STRUCT(output);

    // One work-item converts 8 pixels = 16 source bytes
    const uchar16 px = vload16(0, src.ptr);

    // Luma sits at the even byte offsets
    const uchar8 y_u8 = px.even;

    // Cb is read from offsets 1, 5, 9, 13 and Cr from 3, 7, 11, 15; each sample is shared by two
    // neighbouring pixels. Subtracting 128 re-centres chroma around zero, using 8-bit signed vectors.
    const char8 cb = (char8)(px.s1, px.s1, px.s5, px.s5, px.s9, px.s9, px.sd, px.sd) - (char8)(128);
    const char8 cr = (char8)(px.s3, px.s3, px.s7, px.s7, px.sb, px.sb, px.sf, px.sf) - (char8)(128);

    const float8 y_f  = convert_float8(y_u8);
    const float8 cb_f = convert_float8(cb);
    const float8 cr_f = convert_float8(cr);

    // Chroma contribution to each colour component
    float8 red   = (float8)(1.5748f) * cr_f;
    float8 green = (float8)(-0.1873f) * cb_f + (float8)(-0.4681f) * cr_f;
    float8 blue  = (float8)(1.8556f) * cb_f;

    // Add luma
    red += y_f;
    green += y_f;
    blue += y_f;

    // Saturate to [0, 255] with round-toward-zero
    const uchar8 r = convert_uchar8_sat_rtz(red);
    const uchar8 g = convert_uchar8_sat_rtz(green);
    const uchar8 b = convert_uchar8_sat_rtz(blue);

    // Value written to the fourth channel of every output pixel
    const uchar opaque = 255;

    // Interleave into 32 output bytes: pixels 0-3, then pixels 4-7
    const uchar16 out_lo = (uchar16)((uchar4)(r.s0, g.s0, b.s0, opaque), (uchar4)(r.s1, g.s1, b.s1, opaque),
                                     (uchar4)(r.s2, g.s2, b.s2, opaque), (uchar4)(r.s3, g.s3, b.s3, opaque));
    const uchar16 out_hi = (uchar16)((uchar4)(r.s4, g.s4, b.s4, opaque), (uchar4)(r.s5, g.s5, b.s5, opaque),
                                     (uchar4)(r.s6, g.s6, b.s6, opaque), (uchar4)(r.s7, g.s7, b.s7, opaque));

    vstore16(out_lo, 0, dst.ptr);
    vstore16(out_hi, 1, dst.ptr);
}

/** Convert an RGB888 image to NV12 using BT709 color space
 *
 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
 * No offset.
 *
 * @param[in]  input_ptr                           Pointer to the source image. Supported Format: RGB888
 * @param[in]  input_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] luma_ptr                            Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_stride_x                       Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_step_x                         luma_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_stride_y                       Stride of the destination image luma channel in Y dimension (in bytes)
 * @param[in]  luma_step_y                         luma_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_offset_first_element_in_bytes  The offset of the first element in the destination image luma channel
 * @param[out] uv_ptr                              Pointer to the destination uv channel.
Supported Format: U8
 * @param[in]  uv_stride_x                         Stride of the destination uv channel in X dimension (in bytes)
 * @param[in]  uv_step_x                           uv_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  uv_stride_y                         Stride of the destination image uv channel in Y dimension (in bytes)
 * @param[in]  uv_step_y                           uv_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  uv_offset_first_element_in_bytes    The offset of the first element in the destination image uv channel
 *
 */
__kernel void RGB888_to_NV12_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(luma),
    IMAGE_DECLARATION(uv))
{
    Image src    = CONVERT_TO_IMAGE_STRUCT(input);
    Image dst_y  = CONVERT_TO_IMAGE_STRUCT(luma);
    Image dst_uv = CONVERT_TO_IMAGE_STRUCT(uv);

    // Each work-item converts a 2x2 pixel block: four luma values (two per row)
    // and one averaged Cb/Cr pair.
    const short2 lo = (short2)(0);
    const short2 hi = (short2)(255);

    // ---- First row: 8 bytes are loaded, only the first 6 (two RGB pixels) are used
    const uchar8 top    = vload8(0, src.ptr);
    const float2 top_rf = convert_float2(top.s03);
    const float2 top_gf = convert_float2(top.s14);
    const float2 top_bf = convert_float2(top.s25);

    float2 y_f = (float2)(0.0000f) + (float2)(0.2126f) * top_rf + (float2)(0.7152f) * top_gf + (float2)(0.0722f) * top_bf;
    float2 u_f = (float2)(0.0000f) - (float2)(0.1146f) * top_rf - (float2)(0.3854f) * top_gf + (float2)(0.5000f) * top_bf;
    float2 v_f = (float2)(0.0000f) + (float2)(0.5000f) * top_rf - (float2)(0.4542f) * top_gf - (float2)(0.0458f) * top_bf;

    // Truncate toward zero first, then bias chroma by 128
    short2 y_i = convert_short2_rtz(y_f);
    short2 u_i = convert_short2_rtz(u_f) + (short2)(128);
    short2 v_i = convert_short2_rtz(v_f) + (short2)(128);

    vstore2(convert_uchar2(clamp(y_i, lo, hi)), 0, dst_y.ptr);

    const uchar2 top_cb = convert_uchar2(clamp(u_i, lo, hi));
    const uchar2 top_cr = convert_uchar2(clamp(v_i, lo, hi));

    // ---- Second row: same conversion, one source row stride further down
    const uchar8 bot    = vload8(0, src.ptr + input_stride_y);
    const float2 bot_rf = convert_float2(bot.s03);
    const float2 bot_gf = convert_float2(bot.s14);
    const float2 bot_bf = convert_float2(bot.s25);

    y_f = (float2)(0.0000f) + (float2)(0.2126f) * bot_rf + (float2)(0.7152f) * bot_gf + (float2)(0.0722f) * bot_bf;
    u_f = (float2)(0.0000f) - (float2)(0.1146f) * bot_rf - (float2)(0.3854f) * bot_gf + (float2)(0.5000f) * bot_bf;
    v_f = (float2)(0.0000f) + (float2)(0.5000f) * bot_rf - (float2)(0.4542f) * bot_gf - (float2)(0.0458f) * bot_bf;

    y_i = convert_short2_rtz(y_f);
    u_i = convert_short2_rtz(u_f) + (short2)(128);
    v_i = convert_short2_rtz(v_f) + (short2)(128);

    vstore2(convert_uchar2(clamp(y_i, lo, hi)), 0, dst_y.ptr + luma_stride_y);

    const uchar2 bot_cb = convert_uchar2(clamp(u_i, lo, hi));
    const uchar2 bot_cr = convert_uchar2(clamp(v_i, lo, hi));

    // Average the four chroma samples of the block. The sums are formed on scalars,
    // which are promoted to int, so they cannot wrap at 8 bits.
    const int cb_sum = top_cb.s0 + top_cb.s1 + bot_cb.s0 + bot_cb.s1;
    const int cr_sum = top_cr.s0 + top_cr.s1 + bot_cr.s0 + bot_cr.s1;

    vstore2((uchar2)(cb_sum / 4, cr_sum / 4), 0, dst_uv.ptr);
}

/*
    R'= Y' + 0.0000*U + 1.5748*V
    G'= Y' - 0.1873*U - 0.4681*V
    B'= Y' + 1.8556*U + 0.0000*V
*/

/** Convert an NV12 image to RGB888
 *
 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
 * No offset.
 *
 * @param[in]  luma_input_ptr                           Pointer to the source luma channel.
Supported Format: U8
 * @param[in]  luma_input_stride_x                      Stride of the luma image in X dimension (in bytes)
 * @param[in]  luma_input_step_x                        luma_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_input_stride_y                      Stride of the source luma channel in Y dimension (in bytes)
 * @param[in]  luma_input_step_y                        luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_input_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in]  uv_input_ptr                             Pointer to the source uv channel. Supported Format: U8
 * @param[in]  uv_input_stride_x                        Stride of the source image uv channel in X dimension (in bytes)
 * @param[in]  uv_input_step_x                          uv_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  uv_input_stride_y                        Stride of the source image in Y dimension (in bytes)
 * @param[in]  uv_input_step_y                          uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  uv_input_offset_first_element_in_bytes   The offset of the first element in the source image
 * @param[out] rgb_output_ptr                           Pointer to the destination image. Supported Format: U8
 * @param[in]  rgb_output_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  rgb_output_step_x                        rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rgb_output_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  rgb_output_step_y                        rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void NV12_to_RGB888_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(uv_input),
    IMAGE_DECLARATION(rgb_output))
{
    Image src_y  = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image src_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
    Image dst    = CONVERT_TO_IMAGE_STRUCT(rgb_output);

    // Each work-item converts 8 pixels: 4 on each of two rows, sharing one 4-byte chroma load
    const uchar4 y_top = vload4(0, src_y.ptr);
    const uchar4 y_bot = vload4(0, src_y.ptr + luma_input_stride_y);
    const uchar4 uv    = vload4(0, src_uv.ptr);

    // Bytes 0 and 2 are used as Cb, bytes 1 and 3 as Cr; each sample covers two horizontally
    // adjacent pixels. Subtracting 128 re-centres chroma around zero, using 8-bit signed vectors.
    const char4 cb = (char4)(uv.s0, uv.s0, uv.s2, uv.s2) - (char4)(128);
    const char4 cr = (char4)(uv.s1, uv.s1, uv.s3, uv.s3) - (char4)(128);

    const float4 cb_f = convert_float4(cb);
    const float4 cr_f = convert_float4(cr);

    // Chroma contribution to R, G and B; identical for both rows
    const float4 delta_r = (float4)(0.0000f) + (float4)(0.0000f) * cb_f + (float4)(1.5748f) * cr_f;
    const float4 delta_g = (float4)(0.0000f) - (float4)(0.1873f) * cb_f - (float4)(0.4681f) * cr_f;
    const float4 delta_b = (float4)(0.0000f) + (float4)(1.8556f) * cb_f + (float4)(0.0000f) * cr_f;

    // ---- First row
    const float4 y_top_f = convert_float4(y_top);

    uchar4 r = convert_uchar4_sat_rtz(y_top_f + delta_r);
    uchar4 g = convert_uchar4_sat_rtz(y_top_f + delta_g);
    uchar4 b = convert_uchar4_sat_rtz(y_top_f + delta_b);

    // 12 output bytes per row, written as an 8-byte and a 4-byte vector
    uchar8 out_head = (uchar8)((uchar3)(r.s0, g.s0, b.s0), (uchar3)(r.s1, g.s1, b.s1), r.s2, g.s2);
    uchar4 out_tail = (uchar4)(b.s2, (uchar3)(r.s3, g.s3, b.s3));
    vstore8(out_head, 0, dst.ptr);
    vstore4(out_tail, 2, dst.ptr); // byte offset 8

    // ---- Second row, one destination row stride further down
    const float4 y_bot_f = convert_float4(y_bot);

    r = convert_uchar4_sat_rtz(y_bot_f + delta_r);
    g = convert_uchar4_sat_rtz(y_bot_f + delta_g);
    b = convert_uchar4_sat_rtz(y_bot_f + delta_b);

    out_head = (uchar8)((uchar3)(r.s0, g.s0, b.s0), (uchar3)(r.s1, g.s1, b.s1), r.s2, g.s2);
    out_tail = (uchar4)(b.s2, (uchar3)(r.s3, g.s3, b.s3));
    vstore8(out_head, 0, dst.ptr + rgb_output_stride_y);
    vstore4(out_tail, 2, dst.ptr + rgb_output_stride_y); // byte offset 8 within the row
}

/** Convert an RGB888 image to YUV444 using BT709 color space
 *
 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
 * No offset.
 *
 * @param[in]  rgb_input_ptr                             Pointer to the source image. Supported Format: U8
 * @param[in]  rgb_input_stride_x                        Stride of the source image in X dimension (in bytes)
 * @param[in]  rgb_input_step_x                          rgb_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rgb_input_stride_y                        Stride of the source image in Y dimension (in bytes)
 * @param[in]  rgb_input_step_y                          rgb_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rgb_input_offset_first_element_in_bytes   The offset of the first element in the source image
 * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
 * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
 * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
 * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
 * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
 * @param[out] v_output_ptr                              Pointer to the destination V channel.
Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
 * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
 * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
 * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
 * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
 * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
 * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
 * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  v_output_stride_y                         Stride of the destination image V channel in Y dimension (in bytes)
 * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
 *
 * @note Each work-item reads 12 bytes (4 RGB pixels) and writes 4 bytes to each of the Y, U and V planes.
 */
__kernel void RGB888_to_YUV444_bt709(
    IMAGE_DECLARATION(rgb_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    // handle 4 pixels every time
    Image in_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_input);
    Image out_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image out_u  = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image out_v  = CONVERT_TO_IMAGE_STRUCT(v_output);

    // Read 4 pixels. They occupy exactly 12 bytes, so load 8 + 4 bytes instead of
    // a 16-byte vector, which would touch 4 bytes beyond the pixels being processed.
    uchar8 rgb_lo = vload8(0, in_rgb.ptr);     // bytes 0..7
    uchar4 rgb_hi = vload4(0, in_rgb.ptr + 8); // bytes 8..11
    uchar4 r_0    = (uchar4)(rgb_lo.s0, rgb_lo.s3, rgb_lo.s6, rgb_hi.s1);
    uchar4 g_0    = (uchar4)(rgb_lo.s1, rgb_lo.s4, rgb_lo.s7, rgb_hi.s2);
    uchar4 b_0    = (uchar4)(rgb_lo.s2, rgb_lo.s5, rgb_hi.s0, rgb_hi.s3);

    float4 f_y = (float4)(0.0000f) + (float4)(0.2126f) * convert_float4(r_0) + (float4)(0.7152f) * convert_float4(g_0) + (float4)(0.0722f) * convert_float4(b_0);
    float4 f_u = (float4)(0.0000f) - (float4)(0.1146f) * convert_float4(r_0) - (float4)(0.3854f) * convert_float4(g_0) + (float4)(0.5000f) * convert_float4(b_0);
    float4 f_v = (float4)(0.0000f) + (float4)(0.5000f) * convert_float4(r_0) - (float4)(0.4542f) * convert_float4(g_0) - (float4)(0.0458f) * convert_float4(b_0);

    // Truncate towards zero, then re-centre the chroma on 128
    short4 i_y = convert_short4_rtz(f_y);
    short4 i_u = convert_short4_rtz(f_u) + (short4)(128);
    short4 i_v = convert_short4_rtz(f_v) + (short4)(128);

    // Clamp to [0, 255] before narrowing to 8 bits
    uchar4 luma_0 = convert_uchar4(max((short4)(0), min(i_y, (short4)(255))));
    vstore4(luma_0, 0, out_y.ptr);

    uchar4 cb_0 = convert_uchar4(max((short4)(0), min(i_u, (short4)(255))));
    uchar4 cr_0 = convert_uchar4(max((short4)(0), min(i_v, (short4)(255))));
    vstore4(cb_0, 0, out_u.ptr);
    vstore4(cr_0, 0, out_v.ptr);
}

/** Convert an RGB image to IYUV using BT709 color space
 *
 * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
 * No offset.
 *
 * @param[in]  rgb_input_ptr                             Pointer to the source image. Supported Format: U8
 * @param[in]  rgb_input_stride_x                        Stride of the source image in X dimension (in bytes)
 * @param[in]  rgb_input_step_x                          rgb_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rgb_input_stride_y                        Stride of the source image in Y dimension (in bytes)
 * @param[in]  rgb_input_step_y                          rgb_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rgb_input_offset_first_element_in_bytes   The offset of the first element in the source image
 * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
 * @param[out] u_output_ptr                              Pointer to the destination U channel.
Supported Format: U8
 * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
 * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
 * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
 * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
 * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
 * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
 * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
 *
 * @note Each work-item reads 6 bytes (2 RGB pixels) from two consecutive lines, writes 2 luma bytes
 *       to each of two consecutive output lines, and writes one byte to each of the U and V planes.
 */
__kernel void RGB888_to_IYUV_bt709(
    IMAGE_DECLARATION(rgb_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    // handle 4 pixels every time, two lines, each line for 2 pixels
    Image in_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_input);
    Image out_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image out_u  = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image out_v  = CONVERT_TO_IMAGE_STRUCT(v_output);

    // Read 2 pixels of the first line. They occupy exactly 6 bytes, so load 4 + 2 bytes
    // instead of an 8-byte vector, which would touch 2 bytes beyond the processed pixels.
    uchar4 rgb_0_lo = vload4(0, in_rgb.ptr);     // bytes 0..3
    uchar2 rgb_0_hi = vload2(0, in_rgb.ptr + 4); // bytes 4..5
    uchar2 r_0      = (uchar2)(rgb_0_lo.s0, rgb_0_lo.s3);
    uchar2 g_0      = (uchar2)(rgb_0_lo.s1, rgb_0_hi.s0);
    uchar2 b_0      = (uchar2)(rgb_0_lo.s2, rgb_0_hi.s1);

    float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0);
    float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0);
    float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0);

    // Truncate towards zero, then re-centre the chroma on 128
    short2 i_y = convert_short2_rtz(f_y);
    short2 i_u = convert_short2_rtz(f_u) + (short2)(128);
    short2 i_v = convert_short2_rtz(f_v) + (short2)(128);

    uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
    vstore2(luma_0, 0, out_y.ptr);

    uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
    uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));

    // Read 2 pixels of the second line (same 4 + 2 byte split)
    uchar4 rgb_1_lo = vload4(0, in_rgb.ptr + rgb_input_stride_y);
    uchar2 rgb_1_hi = vload2(0, in_rgb.ptr + rgb_input_stride_y + 4);
    uchar2 r_1      = (uchar2)(rgb_1_lo.s0, rgb_1_lo.s3);
    uchar2 g_1      = (uchar2)(rgb_1_lo.s1, rgb_1_hi.s0);
    uchar2 b_1      = (uchar2)(rgb_1_lo.s2, rgb_1_hi.s1);

    f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1);
    f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1);
    f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1);

    i_y = convert_short2_rtz(f_y);
    i_u = convert_short2_rtz(f_u) + (short2)(128);
    i_v = convert_short2_rtz(f_v) + (short2)(128);

    uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
    vstore2(luma_1, 0, out_y.ptr + luma_output_stride_y);

    uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
    uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));

    // One chroma sample per 2x2 block: the average of the four per-pixel values
    uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4),
                           ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4));
    *out_u.ptr = cbcr.x;
    *out_v.ptr = cbcr.y;
}

/** Convert an RGBA image to YUV444 using BT709 color space
 *
 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
 * No offset.
 *
 * @param[in]  rgba_input_ptr                            Pointer to the source image. Supported Format: U8
 * @param[in]  rgba_input_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  rgba_input_step_x                         rgba_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rgba_input_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  rgba_input_step_y                         rgba_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rgba_input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
 * @param[out] u_output_ptr                              Pointer to the destination U channel.
Supported Format: U8
 * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
 * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
 * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
 * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
 * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
 * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  v_output_stride_y                         Stride of the destination image V channel in Y dimension (in bytes)
 * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
 *
 */
__kernel void RGBA8888_to_YUV444_bt709(
    IMAGE_DECLARATION(rgba_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    Image src   = CONVERT_TO_IMAGE_STRUCT(rgba_input);
    Image dst_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v = CONVERT_TO_IMAGE_STRUCT(v_output);

    // One work-item converts 4 RGBA pixels (16 bytes); the alpha bytes are not used
    const uchar16 pixels = vload16(0, src.ptr);
    const float4  red    = convert_float4(pixels.s048c);
    const float4  green  = convert_float4(pixels.s159d);
    const float4  blue   = convert_float4(pixels.s26ae);

    // Weighted sums producing Y, U and V from R, G and B
    const float4 y_f = (float4)(0.0000f) + (float4)(0.2126f) * red + (float4)(0.7152f) * green + (float4)(0.0722f) * blue;
    const float4 u_f = (float4)(0.0000f) - (float4)(0.1146f) * red - (float4)(0.3854f) * green + (float4)(0.5000f) * blue;
    const float4 v_f = (float4)(0.0000f) + (float4)(0.5000f) * red - (float4)(0.4542f) * green - (float4)(0.0458f) * blue;

    // Convert to integers; the chroma is re-centred on 128
    const short4 y_s = convert_short4(y_f);
    const short4 u_s = convert_short4(u_f) + (short4)(128);
    const short4 v_s = convert_short4(v_f) + (short4)(128);

    // Limit every channel to [0, 255] and write 4 bytes per plane
    const short4 floor_8bit = (short4)(0);
    const short4 ceil_8bit  = (short4)(255);

    vstore4(convert_uchar4_sat(min(max(y_s, floor_8bit), ceil_8bit)), 0, dst_y.ptr);
    vstore4(convert_uchar4_sat(min(max(u_s, floor_8bit), ceil_8bit)), 0, dst_u.ptr);
    vstore4(convert_uchar4_sat(min(max(v_s, floor_8bit), ceil_8bit)), 0, dst_v.ptr);
}

/** Convert an RGBA image to NV12 using BT709 color space
 *
 * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
 * No offset.
 *
 * @param[in]  input_ptr                           Pointer to the source image. Supported Format: U8
 * @param[in]  input_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] luma_output_ptr                     Pointer to the destination luma channel.
Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination image luma channel
 * @param[out] uv_output_ptr                             Pointer to the destination uv channel. Supported Format: U8
 * @param[in]  uv_output_stride_x                        Stride of the destination uv channel in X dimension (in bytes)
 * @param[in]  uv_output_step_x                          uv_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  uv_output_stride_y                        Stride of the destination image uv channel in Y dimension (in bytes)
 * @param[in]  uv_output_step_y                          uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  uv_output_offset_first_element_in_bytes   The offset of the first element in the destination image uv channel
 *
 */
__kernel void RGBA8888_to_NV12_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(uv_output))
{
    Image src    = CONVERT_TO_IMAGE_STRUCT(input);
    Image dst_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_uv = CONVERT_TO_IMAGE_STRUCT(uv_output);

    const short2 floor_8bit = (short2)(0);
    const short2 ceil_8bit  = (short2)(255);
    const short2 chroma_mid = (short2)(128);

    // ---- first line: 2 RGBA pixels (8 bytes), alpha bytes are not used ----
    const uchar8 top   = vload8(0, src.ptr);
    const float2 top_r = convert_float2(top.s04);
    const float2 top_g = convert_float2(top.s15);
    const float2 top_b = convert_float2(top.s26);

    const float2 top_y_f = (float2)(0.0000f) + (float2)(0.2126f) * top_r + (float2)(0.7152f) * top_g + (float2)(0.0722f) * top_b;
    const float2 top_u_f = (float2)(0.0000f) - (float2)(0.1146f) * top_r - (float2)(0.3854f) * top_g + (float2)(0.5000f) * top_b;
    const float2 top_v_f = (float2)(0.0000f) + (float2)(0.5000f) * top_r - (float2)(0.4542f) * top_g - (float2)(0.0458f) * top_b;

    const short2 top_y_s = convert_short2_rtz(top_y_f);
    const short2 top_u_s = convert_short2_rtz(top_u_f) + chroma_mid;
    const short2 top_v_s = convert_short2_rtz(top_v_f) + chroma_mid;

    vstore2(convert_uchar2(min(max(top_y_s, floor_8bit), ceil_8bit)), 0, dst_y.ptr);

    const uchar2 top_cb = convert_uchar2(min(max(top_u_s, floor_8bit), ceil_8bit));
    const uchar2 top_cr = convert_uchar2(min(max(top_v_s, floor_8bit), ceil_8bit));

    // ---- second line: same processing, one row further down ----
    const uchar8 bottom   = vload8(0, src.ptr + input_stride_y);
    const float2 bottom_r = convert_float2(bottom.s04);
    const float2 bottom_g = convert_float2(bottom.s15);
    const float2 bottom_b = convert_float2(bottom.s26);

    const float2 bottom_y_f = (float2)(0.0000f) + (float2)(0.2126f) * bottom_r + (float2)(0.7152f) * bottom_g + (float2)(0.0722f) * bottom_b;
    const float2 bottom_u_f = (float2)(0.0000f) - (float2)(0.1146f) * bottom_r - (float2)(0.3854f) * bottom_g + (float2)(0.5000f) * bottom_b;
    const float2 bottom_v_f = (float2)(0.0000f) + (float2)(0.5000f) * bottom_r - (float2)(0.4542f) * bottom_g - (float2)(0.0458f) * bottom_b;

    const short2 bottom_y_s = convert_short2_rtz(bottom_y_f);
    const short2 bottom_u_s = convert_short2_rtz(bottom_u_f) + chroma_mid;
    const short2 bottom_v_s = convert_short2_rtz(bottom_v_f) + chroma_mid;

    vstore2(convert_uchar2(min(max(bottom_y_s, floor_8bit), ceil_8bit)), 0, dst_y.ptr + luma_output_stride_y);

    const uchar2 bottom_cb = convert_uchar2(min(max(bottom_u_s, floor_8bit), ceil_8bit));
    const uchar2 bottom_cr = convert_uchar2(min(max(bottom_v_s, floor_8bit), ceil_8bit));

    // ---- chroma: average the four per-pixel values and store them as one (Cb, Cr) pair ----
    const int cb_sum = top_cb.s0 + top_cb.s1 + bottom_cb.s0 + bottom_cb.s1;
    const int cr_sum = top_cr.s0 + top_cr.s1 + bottom_cr.s0 + bottom_cr.s1;

    const uchar2 cbcr = (uchar2)((cb_sum / 4), (cr_sum / 4));
    vstore2(cbcr, 0, dst_uv.ptr);
}

/** Convert an RGBA image to IYUV using BT709 color space
 *
 * Global Workgroup Size [
DIV_CEIL(width, 2), height ]
 * No offset.
 *
 * @param[in]  rgba_input_ptr                            Pointer to the source image. Supported Format: U8
 * @param[in]  rgba_input_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  rgba_input_step_x                         rgba_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rgba_input_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  rgba_input_step_y                         rgba_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rgba_input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
 * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
 * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
 * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
 * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
 * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
 * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
 * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
 * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
 *
 */
__kernel void RGBA8888_to_IYUV_bt709(
    IMAGE_DECLARATION(rgba_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    // A work-item covers a 2x2 pixel block: 2 pixels on each of two consecutive lines
    Image src_rgba = CONVERT_TO_IMAGE_STRUCT(rgba_input);
    Image dst_y    = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u    = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v    = CONVERT_TO_IMAGE_STRUCT(v_output);

    const short2 lower      = (short2)(0);
    const short2 upper      = (short2)(255);
    const short2 chroma_mid = (short2)(128);

    // Upper line: 2 RGBA pixels (8 bytes), alpha bytes are not used
    const uchar8 row0   = vload8(0, src_rgba.ptr);
    const float2 row0_r = convert_float2(row0.s04);
    const float2 row0_g = convert_float2(row0.s15);
    const float2 row0_b = convert_float2(row0.s26);

    const float2 row0_y_f = (float2)(0.0000f) + (float2)(0.2126f) * row0_r + (float2)(0.7152f) * row0_g + (float2)(0.0722f) * row0_b;
    const float2 row0_u_f = (float2)(0.0000f) - (float2)(0.1146f) * row0_r - (float2)(0.3854f) * row0_g + (float2)(0.5000f) * row0_b;
    const float2 row0_v_f = (float2)(0.0000f) + (float2)(0.5000f) * row0_r - (float2)(0.4542f) * row0_g - (float2)(0.0458f) * row0_b;

    const short2 row0_y_s = convert_short2_rtz(row0_y_f);
    const short2 row0_u_s = convert_short2_rtz(row0_u_f) + chroma_mid;
    const short2 row0_v_s = convert_short2_rtz(row0_v_f) + chroma_mid;

    const uchar2 row0_luma = convert_uchar2(min(max(row0_y_s, lower), upper));
    const uchar2 row0_cb   = convert_uchar2(min(max(row0_u_s, lower), upper));
    const uchar2 row0_cr   = convert_uchar2(min(max(row0_v_s, lower), upper));
    vstore2(row0_luma, 0, dst_y.ptr);

    // Lower line: identical processing one row further down
    const uchar8 row1   = vload8(0, src_rgba.ptr + rgba_input_stride_y);
    const float2 row1_r = convert_float2(row1.s04);
    const float2 row1_g = convert_float2(row1.s15);
    const float2 row1_b = convert_float2(row1.s26);

    const float2 row1_y_f = (float2)(0.0000f) + (float2)(0.2126f) * row1_r + (float2)(0.7152f) * row1_g + (float2)(0.0722f) * row1_b;
    const float2 row1_u_f = (float2)(0.0000f) - (float2)(0.1146f) * row1_r - (float2)(0.3854f) * row1_g + (float2)(0.5000f) * row1_b;
    const float2 row1_v_f = (float2)(0.0000f) + (float2)(0.5000f) * row1_r - (float2)(0.4542f) * row1_g - (float2)(0.0458f) * row1_b;

    const short2 row1_y_s = convert_short2_rtz(row1_y_f);
    const short2 row1_u_s = convert_short2_rtz(row1_u_f) + chroma_mid;
    const short2 row1_v_s = convert_short2_rtz(row1_v_f) + chroma_mid;

    const uchar2 row1_luma = convert_uchar2(min(max(row1_y_s, lower), upper));
    const uchar2 row1_cb   = convert_uchar2(min(max(row1_u_s, lower), upper));
    const uchar2 row1_cr   = convert_uchar2(min(max(row1_v_s, lower), upper));
    vstore2(row1_luma, 0, dst_y.ptr + luma_output_stride_y);

    // The block gets a single U and a single V sample: the mean of its four pixels
    const int    cb_total = row0_cb.s0 + row0_cb.s1 + row1_cb.s0 + row1_cb.s1;
    const int    cr_total = row0_cr.s0 + row0_cr.s1 + row1_cr.s0 + row1_cr.s1;
    const uchar2 mean_cbcr = (uchar2)((cb_total / 4), (cr_total / 4));

    *dst_u.ptr = mean_cbcr.x;
    *dst_v.ptr = mean_cbcr.y;
}

/** Convert an NV12 image to RGBA8888
 *
 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
 * No offset.
 *
 * @param[in]  luma_input_ptr                           Pointer to the source luma channel.
Supported Format: U8
 * @param[in]  luma_input_stride_x                      Stride of the luma image in X dimension (in bytes)
 * @param[in]  luma_input_step_x                        luma_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_input_stride_y                      Stride of the source luma channel in Y dimension (in bytes)
 * @param[in]  luma_input_step_y                        luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_input_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in]  uv_input_ptr                             Pointer to the source uv channel. Supported Format: U8
 * @param[in]  uv_input_stride_x                        Stride of the source image uv channel in X dimension (in bytes)
 * @param[in]  uv_input_step_x                          uv_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  uv_input_stride_y                        Stride of the source image in Y dimension (in bytes)
 * @param[in]  uv_input_step_y                          uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  uv_input_offset_first_element_in_bytes   The offset of the first element in the source image
 * @param[out] rgb_output_ptr                           Pointer to the destination image. Supported Format: U8
 * @param[in]  rgb_output_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  rgb_output_step_x                        rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rgb_output_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  rgb_output_step_y                        rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
 *
 * @note Each work-item reads 4 luma bytes from two consecutive lines plus 4 interleaved Cb/Cr bytes,
 *       and writes 16 bytes (4 RGBA pixels, alpha set to 255) to each of two consecutive output lines.
 */
__kernel void NV12_to_RGBA8888_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(uv_input),
    IMAGE_DECLARATION(rgb_output))
{
    Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image in_uv   = CONVERT_TO_IMAGE_STRUCT(uv_input);
    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);

    // handle 8 pixels every time, two lines, each line for 4 pixels
    uchar4 luma_0 = vload4(0, in_luma.ptr);
    uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y);
    uchar4 cbcr   = vload4(0, in_uv.ptr);

    // Every (Cb, Cr) pair is applied to two horizontally adjacent pixels.
    // The 128 bias is removed in float: the values stay in [-128, 127] without
    // depending on out-of-range uchar -> char narrowing or on 8-bit wrap-around.
    float4 cb = convert_float4(cbcr.s0022) - (float4)(128.0f);
    float4 cr = convert_float4(cbcr.s1133) - (float4)(128.0f);

    // Chroma contribution to R, G and B; identical for both lines
    float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * cb + (float4)(1.5748f) * cr;
    float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * cb - (float4)(0.4681f) * cr;
    float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * cb + (float4)(0.0000f) * cr;

    // First line
    float4 f_r = convert_float4(luma_0) + temp0;
    float4 f_g = convert_float4(luma_0) + temp1;
    float4 f_b = convert_float4(luma_0) + temp2;

    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);

    // 4 interleaved RGBA pixels are 16 bytes: stored as 8 + 8, alpha fixed to 255
    uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
    uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
    vstore8(rgb_0, 0, out_rgb.ptr);
    vstore8(rgb_1, 0, out_rgb.ptr + 8);

    // Second line
    f_r = convert_float4(luma_1) + temp0;
    f_g = convert_float4(luma_1) + temp1;
    f_b = convert_float4(luma_1) + temp2;

    r_0 = convert_uchar4_sat_rtz(f_r);
    g_0 = convert_uchar4_sat_rtz(f_g);
    b_0 = convert_uchar4_sat_rtz(f_b);

    rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
    rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
    vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
    vstore8(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
}

/** Convert an NV12 image to IYUV
 *
 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
 * No offset.
 *
 * @param[in]  luma_input_ptr                           Pointer to the source luma channel. Supported Format: U8
 * @param[in]  luma_input_stride_x                      Stride of the luma image in X dimension (in bytes)
 * @param[in]  luma_input_step_x                        luma_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_input_stride_y                      Stride of the source luma channel in Y dimension (in bytes)
 * @param[in]  luma_input_step_y                        luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_input_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in]  uv_input_ptr                             Pointer to the source uv channel. Supported Format: U8
 * @param[in]  uv_input_stride_x                        Stride of the source image uv channel in X dimension (in bytes)
 * @param[in]  uv_input_step_x                          uv_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  uv_input_stride_y                        Stride of the source image in Y dimension (in bytes)
 * @param[in]  uv_input_step_y                          uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  uv_input_offset_first_element_in_bytes   The offset of the first element in the source image
 * @param[out] luma_output_ptr                          Pointer to the destination luma channel.
Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
 * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
 * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
 * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
 * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
 * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
 * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
 * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
 * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
 */
__kernel void NV12_to_IYUV_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(uv_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    Image src_y  = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image src_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
    Image dst_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u  = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v  = CONVERT_TO_IMAGE_STRUCT(v_output);

    // A work-item covers 32 pixels: 16 on each of two consecutive lines.
    // All loads are issued before any store.
    const uchar16 upper_luma = vload16(0, src_y.ptr);
    const uchar16 lower_luma = vload16(0, src_y.ptr + luma_input_stride_y);
    const uchar16 uv_pairs   = vload16(0, src_uv.ptr);

    // The UV plane is interleaved: even bytes go to the U plane, odd bytes to the V plane
    const uchar8 u_samples = uv_pairs.even;
    const uchar8 v_samples = uv_pairs.odd;

    // Luma is copied unchanged
    vstore16(upper_luma, 0, dst_y.ptr);
    vstore16(lower_luma, 0, dst_y.ptr + luma_output_stride_y);
    vstore8(u_samples, 0, dst_u.ptr);
    vstore8(v_samples, 0, dst_v.ptr);
}

/** Convert an NV12 image to YUV444
 *
 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
 * No offset.
 *
 * @param[in]  luma_input_ptr                           Pointer to the source luma channel.
Supported Format: U8 - * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes) - * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes) - * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8 - * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes) - * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8 - * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes) - * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes) - * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel - * @param[out] u_output_ptr Pointer to the destination U channel. 
Supported Format: U8 - * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes) - * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes) - * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel - * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8 - * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes) - * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes) - * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel - */ -__kernel void NV12_to_YUV444_bt709( - IMAGE_DECLARATION(luma_input), - IMAGE_DECLARATION(uv_input), - IMAGE_DECLARATION(luma_output), - IMAGE_DECLARATION(u_output), - IMAGE_DECLARATION(v_output)) -{ - Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input); - Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input); - Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output); - Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output); - Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output); - - // handle 32 pixels every time, two lines, each line for 16 pixels - uchar16 luma_0 = vload16(0, in_y.ptr); - uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y); - uchar16 cbcr = vload16(0, in_uv.ptr); - uchar16 cb = (uchar16)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2, cbcr.s4, cbcr.s4, cbcr.s6, cbcr.s6, cbcr.s8, cbcr.s8, - cbcr.sa, cbcr.sa, cbcr.sc, 
cbcr.sc, cbcr.se, cbcr.se); - uchar16 cr = (uchar16)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3, cbcr.s5, cbcr.s5, cbcr.s7, cbcr.s7, cbcr.s9, cbcr.s9, - cbcr.sb, cbcr.sb, cbcr.sd, cbcr.sd, cbcr.sf, cbcr.sf); - - vstore16(luma_0, 0, out_y.ptr); - vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y); - vstore16(cb, 0, out_u.ptr); - vstore16(cb, 0, out_u.ptr + u_output_stride_y); - vstore16(cr, 0, out_v.ptr); - vstore16(cr, 0, out_v.ptr + v_output_stride_y); -} - -/** Convert an NV21 image to RGB888 - * - * Global Workgroup Size [ DIV_CEIL(width, 4), height ] - * No offset. - * - * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8 - * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes) - * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes) - * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8 - * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes) - * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] rgb_output_ptr Pointer to the destination image. 
Supported Format: U8 - * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] rgb_output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void NV21_to_RGB888_bt709( - IMAGE_DECLARATION(luma_input), - IMAGE_DECLARATION(uv_input), - IMAGE_DECLARATION(rgb_output)) -{ - Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input); - Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input); - Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output); - - // handle 8 pixels every time, two lines, each line for 4 pixels - uchar4 luma_0 = vload4(0, in_y.ptr); - uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y); - uchar4 cbcr = vload4(0, in_uv.ptr); - char4 cr = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128); - char4 cb = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128); - - float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr); - float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr); - float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr); - - float4 f_r = convert_float4(luma_0) + temp0; - float4 f_g = convert_float4(luma_0) + temp1; - float4 f_b = convert_float4(luma_0) + temp2; - - uchar4 r_0 = convert_uchar4_sat_rtz(f_r); - uchar4 g_0 = convert_uchar4_sat_rtz(f_g); - uchar4 b_0 = convert_uchar4_sat_rtz(f_b); - - uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2); - uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3); - 
vstore8(rgb_0, 0, out_rgb.ptr); - vstore4(rgb_1, 0, out_rgb.ptr + 8); - - f_r = convert_float4(luma_1) + temp0; - f_g = convert_float4(luma_1) + temp1; - f_b = convert_float4(luma_1) + temp2; - - r_0 = convert_uchar4_sat_rtz(f_r); - g_0 = convert_uchar4_sat_rtz(f_g); - b_0 = convert_uchar4_sat_rtz(f_b); - - rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2); - rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3); - vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y); - vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8); -} - -/** Convert an NV12 image to RGB8888 - * - * Global Workgroup Size [ DIV_CEIL(width, 4), height ] - * No offset. - * - * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8 - * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes) - * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes) - * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8 - * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes) - * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] rgba_output_ptr Pointer to the destination image. 
Supported Format: U8 - * @param[in] rgba_output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] rgba_output_step_x rgba_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] rgba_output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] rgba_output_step_y rgba_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] rgba_output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void NV21_to_RGBA8888_bt709( - IMAGE_DECLARATION(luma_input), - IMAGE_DECLARATION(uv_input), - IMAGE_DECLARATION(rgba_output)) -{ - Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input); - Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input); - Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_output); - - // handle 8 pixels every time, two lines, each line for 4 pixels - uchar4 luma_0 = vload4(0, in_luma.ptr); - uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y); - uchar4 cbcr = vload4(0, in_uv.ptr); - char4 cr = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128); - char4 cb = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128); - - float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr); - float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr); - float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr); - - float4 f_r = convert_float4(luma_0) + temp0; - float4 f_g = convert_float4(luma_0) + temp1; - float4 f_b = convert_float4(luma_0) + temp2; - - uchar4 r_0 = convert_uchar4_sat_rtz(f_r); - uchar4 g_0 = convert_uchar4_sat_rtz(f_g); - uchar4 b_0 = convert_uchar4_sat_rtz(f_b); - - uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255); - uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, 
r_0.s3, g_0.s3, b_0.s3, 255); - vstore8(rgb_0, 0, out_rgb.ptr); - vstore8(rgb_1, 0, out_rgb.ptr + 8); - - f_r = convert_float4(luma_1) + temp0; - f_g = convert_float4(luma_1) + temp1; - f_b = convert_float4(luma_1) + temp2; - - r_0 = convert_uchar4_sat_rtz(f_r); - g_0 = convert_uchar4_sat_rtz(f_g); - b_0 = convert_uchar4_sat_rtz(f_b); - - rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255); - rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255); - vstore8(rgb_0, 0, out_rgb.ptr + rgba_output_stride_y); - vstore8(rgb_1, 0, out_rgb.ptr + rgba_output_stride_y + 8); -} - -/** Convert an NV21 image to YUV444 - * - * Global Workgroup Size [ DIV_CEIL(width, 16), height ] - * No offset. - * - * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8 - * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes) - * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes) - * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] uv_input_ptr Pointer to the source uv channel. 
Supported Format: U8 - * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes) - * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8 - * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes) - * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes) - * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel - * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8 - * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes) - * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes) - * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel - * @param[out] v_output_ptr Pointer to the destination V channel. 
Supported Format: U8 - * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes) - * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes) - * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel - */ -__kernel void NV21_to_YUV444_bt709( - IMAGE_DECLARATION(luma_input), - IMAGE_DECLARATION(uv_input), - IMAGE_DECLARATION(luma_output), - IMAGE_DECLARATION(u_output), - IMAGE_DECLARATION(v_output)) -{ - Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input); - Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input); - Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output); - Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output); - Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output); - - // handle 32 pixels every time, two lines, each line for 16 pixels - uchar16 luma_0 = vload16(0, in_y.ptr); - uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y); - uchar16 cbcr = vload16(0, in_uv.ptr); - uchar16 cr = (uchar16)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2, cbcr.s4, cbcr.s4, cbcr.s6, cbcr.s6, cbcr.s8, cbcr.s8, - cbcr.sa, cbcr.sa, cbcr.sc, cbcr.sc, cbcr.se, cbcr.se); - uchar16 cb = (uchar16)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3, cbcr.s5, cbcr.s5, cbcr.s7, cbcr.s7, cbcr.s9, cbcr.s9, - cbcr.sb, cbcr.sb, cbcr.sd, cbcr.sd, cbcr.sf, cbcr.sf); - - vstore16(luma_0, 0, out_y.ptr); - vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y); - vstore16(cb, 0, out_u.ptr); - vstore16(cb, 0, out_u.ptr + u_output_stride_y); - vstore16(cr, 0, out_v.ptr); - vstore16(cr, 0, out_v.ptr + v_output_stride_y); -} - -/** Convert an NV21 image to IYUV - * - * Global Workgroup Size [ DIV_CEIL(width, 16), height ] - * No offset. 
- * - * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8 - * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes) - * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes) - * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8 - * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes) - * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8 - * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes) - * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes) - * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel - * @param[out] u_output_ptr Pointer to the destination U channel. 
Supported Format: U8 - * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes) - * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes) - * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel - * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8 - * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes) - * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes) - * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel - */ -__kernel void NV21_to_IYUV_bt709( - IMAGE_DECLARATION(luma_input), - IMAGE_DECLARATION(uv_input), - IMAGE_DECLARATION(luma_output), - IMAGE_DECLARATION(u_output), - IMAGE_DECLARATION(v_output)) -{ - Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input); - Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input); - Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output); - Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output); - Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output); - - uchar16 luma_0 = vload16(0, in_y.ptr); - uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y); - uchar16 cbcr = vload16(0, in_uv.ptr); - uchar8 cr = (uchar8)(cbcr.s0, cbcr.s2, cbcr.s4, cbcr.s6, cbcr.s8, cbcr.sa, cbcr.sc, cbcr.se); - uchar8 cb = (uchar8)(cbcr.s1, cbcr.s3, cbcr.s5, cbcr.s7, cbcr.s9, cbcr.sb, cbcr.sd, cbcr.sf); - - vstore16(luma_0, 0, 
out_y.ptr); - vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y); - vstore8(cb, 0, out_u.ptr); - vstore8(cr, 0, out_v.ptr); -} - -/** Convert a UYVY image to IYUV using BT709 color space - * - * Global Workgroup Size [ DIV_CEIL(width, 8), height ] - * No offset. - * - * @param[in] uyvy_input_ptr Pointer to the source image. Supported Format: U8 - * @param[in] uyvy_input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] uyvy_input_step_x uyvy_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] uyvy_input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] uyvy_input_step_y uyvy_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] uyvy_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8 - * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes) - * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes) - * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel - * @param[out] u_output_ptr Pointer to the destination U channel. 
Supported Format: U8 - * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes) - * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes) - * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel - * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8 - * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes) - * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes) - * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel - * - */ -__kernel void UYVY422_to_IYUV_bt709( - IMAGE_DECLARATION(uyvy_input), - IMAGE_DECLARATION(luma_output), - IMAGE_DECLARATION(u_output), - IMAGE_DECLARATION(v_output)) -{ - Image in_uyvy = CONVERT_TO_IMAGE_STRUCT(uyvy_input); - Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output); - Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output); - Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output); - - // handle 16 pixels every time, each line 8 pixels - uchar16 uyvy = vload16(0, in_uyvy.ptr); - uchar8 luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf); - ushort4 cb_0 = (ushort4)(uyvy.s0, uyvy.s4, uyvy.s8, uyvy.sc); - ushort4 cr_0 = (ushort4)(uyvy.s2, uyvy.s6, uyvy.sa, uyvy.se); - vstore8(luma, 0, out_y.ptr); - - uyvy = vload16(0, in_uyvy.ptr + uyvy_input_stride_y); - luma = (uchar8)(uyvy.s1, 
uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf); - ushort4 cb_1 = (ushort4)(uyvy.s0, uyvy.s4, uyvy.s8, uyvy.sc); - ushort4 cr_1 = (ushort4)(uyvy.s2, uyvy.s6, uyvy.sa, uyvy.se); - vstore8(luma, 0, out_y.ptr + luma_output_stride_y); - - uchar4 cb = convert_uchar4((cb_0 + cb_1) / (ushort4)(2)); - uchar4 cr = convert_uchar4((cr_0 + cr_1) / (ushort4)(2)); - vstore4(cb, 0, out_u.ptr); - vstore4(cr, 0, out_v.ptr); -} - -/** Convert a YUYV image to IYUV using BT709 color space - * - * Global Workgroup Size [ DIV_CEIL(width, 8), height ] - * No offset. - * - * @param[in] yuyv_input_ptr Pointer to the source image. Supported Format: U8 - * @param[in] yuyv_input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] yuyv_input_step_x yuyv_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] yuyv_input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] yuyv_input_step_y yuyv_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] yuyv_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8 - * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes) - * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes) - * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel - * @param[out] u_output_ptr Pointer to the destination U channel. 
Supported Format: U8 - * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes) - * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes) - * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel - * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8 - * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes) - * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes) - * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel - * - */ -__kernel void YUYV422_to_IYUV_bt709( - IMAGE_DECLARATION(yuyv_input), - IMAGE_DECLARATION(luma_output), - IMAGE_DECLARATION(u_output), - IMAGE_DECLARATION(v_output)) -{ - Image in_yuyv = CONVERT_TO_IMAGE_STRUCT(yuyv_input); - Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output); - Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output); - Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output); - - // handle 16 pixels every time, each line 8 pixels - uchar16 yuyv = vload16(0, in_yuyv.ptr); - uchar8 luma = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se); - ushort4 cb_0 = (ushort4)(yuyv.s1, yuyv.s5, yuyv.s9, yuyv.sd); - ushort4 cr_0 = (ushort4)(yuyv.s3, yuyv.s7, yuyv.sb, yuyv.sf); - vstore8(luma, 0, out_y.ptr); - - yuyv = vload16(0, in_yuyv.ptr + yuyv_input_stride_y); - luma = (uchar8)(yuyv.s0, 
yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se); - ushort4 cb_1 = (ushort4)(yuyv.s1, yuyv.s5, yuyv.s9, yuyv.sd); - ushort4 cr_1 = (ushort4)(yuyv.s3, yuyv.s7, yuyv.sb, yuyv.sf); - vstore8(luma, 0, out_y.ptr + luma_output_stride_y); - - uchar4 cb = convert_uchar4((cb_0 + cb_1) / (ushort4)(2)); - uchar4 cr = convert_uchar4((cr_0 + cr_1) / (ushort4)(2)); - vstore4(cb, 0, out_u.ptr); - vstore4(cr, 0, out_v.ptr); -} - -/** Convert an IYUV image to RGB888 - * - * Global Workgroup Size [ DIV_CEIL(width, 4), height ] - * No offset. - * - * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8 - * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes) - * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes) - * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] u_input_ptr Pointer to the source U channel. Supported Format: U8 - * @param[in] u_input_stride_x Stride of the source image U channel in X dimension (in bytes) - * @param[in] u_input_step_x u_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] u_input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] u_input_step_y u_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] u_input_offset_first_element_in_bytes The offset of the first element in the source U channel - * @param[in] v_input_ptr Pointer to the source V channel. 
Supported Format: U8
 * @param[in]  v_input_stride_x                         Stride of the source V channel in X dimension (in bytes)
 * @param[in]  v_input_step_x                           v_input_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  v_input_stride_y                         Stride of the source V channel in Y dimension (in bytes)
 * @param[in]  v_input_step_y                           v_input_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  v_input_offset_first_element_in_bytes    Offset of the first element in the source V channel
 * @param[out] rgb_output_ptr                           Pointer to the destination image. Supported Format: U8
 * @param[in]  rgb_output_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  rgb_output_step_x                        rgb_output_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  rgb_output_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  rgb_output_step_y                        rgb_output_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  rgb_output_offset_first_element_in_bytes Offset of the first element in the destination image
 *
 * Each work-item reads four luma bytes from two consecutive rows plus two U and two V bytes,
 * and writes twelve bytes (four R,G,B triplets) to each of the two matching destination rows.
 */
__kernel void IYUV_to_RGB888_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(u_input),
    IMAGE_DECLARATION(v_input),
    IMAGE_DECLARATION(rgb_output))
{
    Image src_y   = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image src_u   = CONVERT_TO_IMAGE_STRUCT(u_input);
    Image src_v   = CONVERT_TO_IMAGE_STRUCT(v_input);
    Image dst_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);

    // 8 pixels per work-item: two rows of 4 pixels sharing the same chroma samples
    const uchar4 y_top    = vload4(0, src_y.ptr);
    const uchar4 y_bottom = vload4(0, src_y.ptr + luma_input_stride_y);
    const uchar2 u_pair   = vload2(0, src_u.ptr);
    const uchar2 v_pair   = vload2(0, src_v.ptr);

    // Every chroma sample serves two horizontally adjacent pixels; remove the 128 bias
    const char4 cb = (char4)(u_pair.s0, u_pair.s0, u_pair.s1, u_pair.s1) - (char4)(128);
    const char4 cr = (char4)(v_pair.s0, v_pair.s0, v_pair.s1, v_pair.s1) - (char4)(128);

    const float4 cb_f = convert_float4(cb);
    const float4 cr_f = convert_float4(cr);

    // Chroma contribution to each colour channel (shared by both rows)
    const float4 chroma_r = (float4)(0.0000f) + (float4)(0.0000f) * cb_f + (float4)(1.5748f) * cr_f;
    const float4 chroma_g = (float4)(0.0000f) - (float4)(0.1873f) * cb_f - (float4)(0.4681f) * cr_f;
    const float4 chroma_b = (float4)(0.0000f) + (float4)(1.8556f) * cb_f + (float4)(0.0000f) * cr_f;

    // Top row: saturate to [0, 255], rounding toward zero
    const float4 y_top_f = convert_float4(y_top);
    uchar4       r       = convert_uchar4_sat_rtz(y_top_f + chroma_r);
    uchar4       g       = convert_uchar4_sat_rtz(y_top_f + chroma_g);
    uchar4       b       = convert_uchar4_sat_rtz(y_top_f + chroma_b);

    // 12 interleaved bytes are written as one 8-byte store followed by one 4-byte store
    vstore8((uchar8)(r.s0, g.s0, b.s0, r.s1, g.s1, b.s1, r.s2, g.s2), 0, dst_rgb.ptr);
    vstore4((uchar4)(b.s2, r.s3, g.s3, b.s3), 0, dst_rgb.ptr + 8);

    // Bottom row reuses the same chroma terms
    const float4 y_bottom_f = convert_float4(y_bottom);
    r                       = convert_uchar4_sat_rtz(y_bottom_f + chroma_r);
    g                       = convert_uchar4_sat_rtz(y_bottom_f + chroma_g);
    b                       = convert_uchar4_sat_rtz(y_bottom_f + chroma_b);

    vstore8((uchar8)(r.s0, g.s0, b.s0, r.s1, g.s1, b.s1, r.s2, g.s2), 0, dst_rgb.ptr + rgb_output_stride_y);
    vstore4((uchar4)(b.s2, r.s3, g.s3, b.s3), 0, dst_rgb.ptr + rgb_output_stride_y + 8);
}

/** Convert an IYUV image to RGB8888
 *
 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
 * No offset.
 *
 * @param[in]  luma_input_ptr                           Pointer to the source luma channel.
Supported Format: U8
 * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
 * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
 * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  luma_input_offset_first_element_in_bytes  Offset of the first element in the source luma channel
 * @param[in]  u_input_ptr                               Pointer to the source U channel. Supported Format: U8
 * @param[in]  u_input_stride_x                          Stride of the source U channel in X dimension (in bytes)
 * @param[in]  u_input_step_x                            u_input_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  u_input_stride_y                          Stride of the source U channel in Y dimension (in bytes)
 * @param[in]  u_input_step_y                            u_input_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  u_input_offset_first_element_in_bytes     Offset of the first element in the source U channel
 * @param[in]  v_input_ptr                               Pointer to the source V channel. Supported Format: U8
 * @param[in]  v_input_stride_x                          Stride of the source V channel in X dimension (in bytes)
 * @param[in]  v_input_step_x                            v_input_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  v_input_stride_y                          Stride of the source V channel in Y dimension (in bytes)
 * @param[in]  v_input_step_y                            v_input_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  v_input_offset_first_element_in_bytes     Offset of the first element in the source V channel
 * @param[out] rgba_output_ptr                           Pointer to the destination image. Supported Format: U8
 * @param[in]  rgba_output_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  rgba_output_step_x                        rgba_output_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  rgba_output_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  rgba_output_step_y                        rgba_output_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  rgba_output_offset_first_element_in_bytes Offset of the first element in the destination image
 *
 * Each work-item reads four luma bytes from two consecutive rows plus two U and two V bytes,
 * and writes sixteen bytes (four R,G,B,A quadruplets with A fixed to 255) to each of the two
 * matching destination rows.
 */
__kernel void IYUV_to_RGBA8888_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(u_input),
    IMAGE_DECLARATION(v_input),
    IMAGE_DECLARATION(rgba_output))
{
    Image src_y    = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image src_u    = CONVERT_TO_IMAGE_STRUCT(u_input);
    Image src_v    = CONVERT_TO_IMAGE_STRUCT(v_input);
    Image dst_rgba = CONVERT_TO_IMAGE_STRUCT(rgba_output);

    // 8 pixels per work-item: two rows of 4 pixels sharing the same chroma samples
    const uchar4 y_top    = vload4(0, src_y.ptr);
    const uchar4 y_bottom = vload4(0, src_y.ptr + luma_input_stride_y);
    const uchar2 u_pair   = vload2(0, src_u.ptr);
    const uchar2 v_pair   = vload2(0, src_v.ptr);

    // Every chroma sample serves two horizontally adjacent pixels; remove the 128 bias
    const char4 cb = (char4)(u_pair.s0, u_pair.s0, u_pair.s1, u_pair.s1) - (char4)(128);
    const char4 cr = (char4)(v_pair.s0, v_pair.s0, v_pair.s1, v_pair.s1) - (char4)(128);

    const float4 cb_f = convert_float4(cb);
    const float4 cr_f = convert_float4(cr);

    // Chroma contribution to each colour channel (shared by both rows)
    const float4 chroma_r = (float4)(0.0000f) + (float4)(0.0000f) * cb_f + (float4)(1.5748f) * cr_f;
    const float4 chroma_g = (float4)(0.0000f) - (float4)(0.1873f) * cb_f - (float4)(0.4681f) * cr_f;
    const float4 chroma_b = (float4)(0.0000f) + (float4)(1.8556f) * cb_f + (float4)(0.0000f) * cr_f;

    // Top row: saturate to [0, 255], rounding toward zero
    const float4 y_top_f = convert_float4(y_top);
    uchar4       r       = convert_uchar4_sat_rtz(y_top_f + chroma_r);
    uchar4       g       = convert_uchar4_sat_rtz(y_top_f + chroma_g);
    uchar4       b       = convert_uchar4_sat_rtz(y_top_f + chroma_b);

    // Pixels 0-1 go in the first 8 bytes, pixels 2-3 in the next 8 bytes
    vstore8((uchar8)(r.s0, g.s0, b.s0, 255, r.s1, g.s1, b.s1, 255), 0, dst_rgba.ptr);
    vstore8((uchar8)(r.s2, g.s2, b.s2, 255, r.s3, g.s3, b.s3, 255), 0, dst_rgba.ptr + 8);

    // Bottom row reuses the same chroma terms
    const float4 y_bottom_f = convert_float4(y_bottom);
    r                       = convert_uchar4_sat_rtz(y_bottom_f + chroma_r);
    g                       = convert_uchar4_sat_rtz(y_bottom_f + chroma_g);
    b                       = convert_uchar4_sat_rtz(y_bottom_f + chroma_b);

    vstore8((uchar8)(r.s0, g.s0, b.s0, 255, r.s1, g.s1, b.s1, 255), 0, dst_rgba.ptr + rgba_output_stride_y);
    vstore8((uchar8)(r.s2, g.s2, b.s2, 255, r.s3, g.s3, b.s3, 255), 0, dst_rgba.ptr + rgba_output_stride_y + 8);
}

/** Convert an IYUV image to YUV444
 *
 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
 * No offset.
 *
 * @param[in]  luma_input_ptr                           Pointer to the source luma channel. Supported Format: U8
 * @param[in]  luma_input_stride_x                      Stride of the luma image in X dimension (in bytes)
 * @param[in]  luma_input_step_x                        luma_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_input_stride_y                      Stride of the source luma channel in Y dimension (in bytes)
 * @param[in]  luma_input_step_y                        luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_input_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in]  u_input_ptr                              Pointer to the source U channel.
Supported Format: U8
 * @param[in]  u_input_stride_x                          Stride of the source U channel in X dimension (in bytes)
 * @param[in]  u_input_step_x                            u_input_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  u_input_stride_y                          Stride of the source U channel in Y dimension (in bytes)
 * @param[in]  u_input_step_y                            u_input_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  u_input_offset_first_element_in_bytes     Offset of the first element in the source U channel
 * @param[in]  v_input_ptr                               Pointer to the source V channel. Supported Format: U8
 * @param[in]  v_input_stride_x                          Stride of the source V channel in X dimension (in bytes)
 * @param[in]  v_input_step_x                            v_input_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  v_input_stride_y                          Stride of the source V channel in Y dimension (in bytes)
 * @param[in]  v_input_step_y                            v_input_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  v_input_offset_first_element_in_bytes     Offset of the first element in the source V channel
 * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes Offset of the first element in the destination luma channel
 * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
 * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
 * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  u_output_stride_y                         Stride of the destination U channel in Y dimension (in bytes)
 * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  u_output_offset_first_element_in_bytes    Offset of the first element in the destination U channel
 * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
 * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
 * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
 * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  v_output_offset_first_element_in_bytes    Offset of the first element in the destination V channel
 *
 * Each work-item copies 16 luma bytes from two consecutive rows and expands 8 U and 8 V bytes
 * into 16 bytes each, written to two consecutive rows of the respective destination plane.
 */
__kernel void IYUV_to_YUV444_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(u_input),
    IMAGE_DECLARATION(v_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    Image src_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image src_u = CONVERT_TO_IMAGE_STRUCT(u_input);
    Image src_v = CONVERT_TO_IMAGE_STRUCT(v_input);
    Image dst_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v = CONVERT_TO_IMAGE_STRUCT(v_output);

    // 32 pixels per work-item: two rows of 16 pixels
    const uchar16 y_top    = vload16(0, src_y.ptr);
    const uchar16 y_bottom = vload16(0, src_y.ptr + luma_input_stride_y);
    const uchar8  u_half   = vload8(0, src_u.ptr);
    const uchar8  v_half   = vload8(0, src_v.ptr);

    // Horizontal upsampling: every chroma sample is repeated for two adjacent pixels
    const uchar16 u_full = u_half.s0011223344556677;
    const uchar16 v_full = v_half.s0011223344556677;

    // Luma is copied unchanged
    vstore16(y_top, 0, dst_y.ptr);
    vstore16(y_bottom, 0, dst_y.ptr + luma_output_stride_y);

    // Vertical upsampling: the same chroma row is written to both destination rows
    vstore16(u_full, 0, dst_u.ptr);
    vstore16(u_full, 0, dst_u.ptr + u_output_stride_y);
    vstore16(v_full, 0, dst_v.ptr);
    vstore16(v_full, 0, dst_v.ptr + v_output_stride_y);
}

/** Convert an IYUV image to NV12
 *
 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
 * No offset.
 *
 * @param[in]  luma_input_ptr                           Pointer to the source luma channel. Supported Format: U8
 * @param[in]  luma_input_stride_x                      Stride of the luma image in X dimension (in bytes)
 * @param[in]  luma_input_step_x                        luma_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_input_stride_y                      Stride of the source luma channel in Y dimension (in bytes)
 * @param[in]  luma_input_step_y                        luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_input_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in]  u_input_ptr                              Pointer to the source U channel.
Supported Format: U8
 * @param[in]  u_input_stride_x                          Stride of the source U channel in X dimension (in bytes)
 * @param[in]  u_input_step_x                            u_input_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  u_input_stride_y                          Stride of the source U channel in Y dimension (in bytes)
 * @param[in]  u_input_step_y                            u_input_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  u_input_offset_first_element_in_bytes     Offset of the first element in the source U channel
 * @param[in]  v_input_ptr                               Pointer to the source V channel. Supported Format: U8
 * @param[in]  v_input_stride_x                          Stride of the source V channel in X dimension (in bytes)
 * @param[in]  v_input_step_x                            v_input_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  v_input_stride_y                          Stride of the source V channel in Y dimension (in bytes)
 * @param[in]  v_input_step_y                            v_input_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  v_input_offset_first_element_in_bytes     Offset of the first element in the source V channel
 * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes Offset of the first element in the destination luma channel
 * @param[out] uv_output_ptr                             Pointer to the destination UV channel. Supported Format: U8
 * @param[in]  uv_output_stride_x                        Stride of the destination UV channel in X dimension (in bytes)
 * @param[in]  uv_output_step_x                          uv_output_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  uv_output_stride_y                        Stride of the destination UV channel in Y dimension (in bytes)
 * @param[in]  uv_output_step_y                          uv_output_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  uv_output_offset_first_element_in_bytes   Offset of the first element in the destination UV channel
 *
 * Each work-item copies 16 luma bytes from two consecutive rows and interleaves 8 U bytes with
 * 8 V bytes into a single 16-byte row of the UV plane (U first).
 */
__kernel void IYUV_to_NV12_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(u_input),
    IMAGE_DECLARATION(v_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(uv_output))
{
    Image src_y  = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image src_u  = CONVERT_TO_IMAGE_STRUCT(u_input);
    Image src_v  = CONVERT_TO_IMAGE_STRUCT(v_input);
    Image dst_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_uv = CONVERT_TO_IMAGE_STRUCT(uv_output);

    // 32 pixels per work-item: two rows of 16 pixels
    const uchar16 y_top    = vload16(0, src_y.ptr);
    const uchar16 y_bottom = vload16(0, src_y.ptr + luma_input_stride_y);
    const uchar8  u_row    = vload8(0, src_u.ptr);
    const uchar8  v_row    = vload8(0, src_v.ptr);

    // Interleave as U0 V0 U1 V1 ...: U fills the even lanes, V the odd lanes
    uchar16 uv_row;
    uv_row.even = u_row;
    uv_row.odd  = v_row;

    vstore16(y_top, 0, dst_y.ptr);
    vstore16(y_bottom, 0, dst_y.ptr + luma_output_stride_y);
    vstore16(uv_row, 0, dst_uv.ptr);
}

/** Convert a YUYV image to NV12 using BT709 color space
 *
 * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
 * No offset.
 *
 * @param[in]  yuyv_input_ptr                           Pointer to the source image.
Supported Format: U8
 * @param[in]  yuyv_input_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  yuyv_input_step_x                         yuyv_input_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  yuyv_input_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  yuyv_input_step_y                         yuyv_input_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  yuyv_input_offset_first_element_in_bytes  Offset of the first element in the source image
 * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes Offset of the first element in the destination luma channel
 * @param[out] uv_output_ptr                             Pointer to the destination UV channel. Supported Format: U8
 * @param[in]  uv_output_stride_x                        Stride of the destination UV channel in X dimension (in bytes)
 * @param[in]  uv_output_step_x                          uv_output_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  uv_output_stride_y                        Stride of the destination UV channel in Y dimension (in bytes)
 * @param[in]  uv_output_step_y                          uv_output_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  uv_output_offset_first_element_in_bytes   Offset of the first element in the destination UV channel
 *
 * Each work-item reads 16 bytes from two consecutive source rows, writes the 8 even bytes of each
 * row to the luma plane and the element-wise average of the 8 odd bytes of both rows to the UV plane.
 */
__kernel void YUYV422_to_NV12_bt709(
    IMAGE_DECLARATION(yuyv_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(uv_output))
{
    Image src    = CONVERT_TO_IMAGE_STRUCT(yuyv_input);
    Image dst_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_uv = CONVERT_TO_IMAGE_STRUCT(uv_output);

    // 16 pixels per work-item: two rows of 8 pixels.
    // Top row: even bytes are luma, odd bytes are interleaved chroma.
    const uchar16 row_top = vload16(0, src.ptr);
    vstore8(row_top.even, 0, dst_y.ptr);

    // Bottom row, same layout
    const uchar16 row_bottom = vload16(0, src.ptr + yuyv_input_stride_y);
    vstore8(row_bottom.even, 0, dst_y.ptr + luma_output_stride_y);

    // Average the chroma of both rows; the sum is formed in 16 bits so it cannot overflow
    const ushort8 chroma_sum = convert_ushort8(row_top.odd) + convert_ushort8(row_bottom.odd);
    vstore8(convert_uchar8(chroma_sum / (ushort8)(2)), 0, dst_uv.ptr);
}

/** Convert a UYVY image to NV12 using BT709 color space
 *
 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
 * No offset.
 *
 * @param[in]  input_uyvy_ptr                           Pointer to the source image.
Supported Format: U8
 * @param[in]  input_uyvy_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  input_uyvy_step_x                        input_uyvy_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  input_uyvy_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  input_uyvy_step_y                        input_uyvy_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  input_uyvy_offset_first_element_in_bytes Offset of the first element in the source image
 * @param[out] luma_ptr                                 Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_stride_x                            Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_step_x                              luma_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  luma_stride_y                            Stride of the destination luma channel in Y dimension (in bytes)
 * @param[in]  luma_step_y                              luma_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  luma_offset_first_element_in_bytes       Offset of the first element in the destination luma channel
 * @param[out] uv_ptr                                   Pointer to the destination uv channel. Supported Format: U8
 * @param[in]  uv_stride_x                              Stride of the destination uv channel in X dimension (in bytes)
 * @param[in]  uv_step_x                                uv_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  uv_stride_y                              Stride of the destination uv channel in Y dimension (in bytes)
 * @param[in]  uv_step_y                                uv_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  uv_offset_first_element_in_bytes         Offset of the first element in the destination uv channel
 *
 * Each work-item reads 16 bytes from two consecutive source rows, writes the 8 odd bytes of each
 * row to the luma plane and the element-wise average of the 8 even bytes of both rows to the uv plane.
 */
__kernel void UYVY422_to_NV12_bt709(
    IMAGE_DECLARATION(input_uyvy),
    IMAGE_DECLARATION(luma),
    IMAGE_DECLARATION(uv))
{
    Image src    = CONVERT_TO_IMAGE_STRUCT(input_uyvy);
    Image dst_y  = CONVERT_TO_IMAGE_STRUCT(luma);
    Image dst_uv = CONVERT_TO_IMAGE_STRUCT(uv);

    // 16 pixels per work-item: two rows of 8 pixels.
    // Top row: odd bytes are luma, even bytes are interleaved chroma.
    const uchar16 row_top = vload16(0, src.ptr);
    vstore8(row_top.odd, 0, dst_y.ptr);

    // Bottom row, same layout
    const uchar16 row_bottom = vload16(0, src.ptr + input_uyvy_stride_y);
    vstore8(row_bottom.odd, 0, dst_y.ptr + luma_stride_y);

    // Average the chroma of both rows; the sum is formed in 16 bits so it cannot overflow
    const ushort8 chroma_sum = convert_ushort8(row_top.even) + convert_ushort8(row_bottom.even);
    vstore8(convert_uchar8(chroma_sum / (ushort8)(2)), 0, dst_uv.ptr);
}
diff --git a/src/core/CL/cl_kernels/convolution3x3.cl b/src/core/CL/cl_kernels/convolution3x3.cl deleted file mode 100644 index 7bca567b11..0000000000 --- a/src/core/CL/cl_kernels/convolution3x3.cl +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (c) 2016-2019 Arm Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#ifndef DATA_TYPE -#define DATA_TYPE short -#endif /* DATA_TYPE */ - -#ifndef DATA_TYPE_OUT -#define DATA_TYPE_OUT uchar -#endif /* DATA_TYPE_OUT */ - -/** Compute a 1D horizontal convolution of size 3 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels). - * - * @param[in] left_pixel Pointer to the left pixel. - * @param[in] left_coeff Weight of the left pixel - * @param[in] middle_coeff Weight of the middle pixel - * @param[in] right_coeff Weight of the right pixel - * - * @return a short8 containing 8 convoluted values. 
*/
inline VEC_DATA_TYPE(DATA_TYPE, 8) convolution1x3(__global const uchar *left_pixel,
                                                  const short left_coeff,
                                                  const short middle_coeff,
                                                  const short right_coeff)
{
    // A single 16-byte load provides the ten bytes covered by the three shifted 8-wide windows
    const uchar16 row = vload16(0, left_pixel);

    // Accumulate left, middle and right taps in that order
    VEC_DATA_TYPE(DATA_TYPE, 8)
    acc = CONVERT(row.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8)) * (VEC_DATA_TYPE(DATA_TYPE, 8))left_coeff;
    acc += CONVERT(row.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8)) * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff;
    acc += CONVERT(row.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8)) * (VEC_DATA_TYPE(DATA_TYPE, 8))right_coeff;

    return acc;
}

/** Apply a 3x3 convolution matrix to a single channel U8 input image and return the result.
 *
 * Convolution matrix layout:
 *
 * [ mat0, mat1, mat2 ]\n
 * [ mat3, mat4, mat5 ]\n
 * [ mat6, mat7, mat8 ]\n
 *
 * @param[in] src   A pointer to source Image structure
 * @param[in] mat0  Coefficient from the convolution matrix
 * @param[in] mat1  Coefficient from the convolution matrix
 * @param[in] mat2  Coefficient from the convolution matrix
 * @param[in] mat3  Coefficient from the convolution matrix
 * @param[in] mat4  Coefficient from the convolution matrix
 * @param[in] mat5  Coefficient from the convolution matrix
 * @param[in] mat6  Coefficient from the convolution matrix
 * @param[in] mat7  Coefficient from the convolution matrix
 * @param[in] mat8  Coefficient from the convolution matrix
 * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
 *
 * @return a short8 containing 8 convoluted and scaled values.
*/
inline VEC_DATA_TYPE(DATA_TYPE, 8) convolution3x3(
    Image *src,
    const short mat0, const short mat1, const short mat2,
    const short mat3, const short mat4, const short mat5,
    const short mat6, const short mat7, const short mat8, uint scale)
{
    // Each row of the matrix is applied to the image row above, at, and below the current position,
    // starting one pixel to the left so the window is centred.
    VEC_DATA_TYPE(DATA_TYPE, 8)
    pixels = convolution1x3(offset(src, -1, -1), mat0, mat1, mat2); // Row 0
    pixels += convolution1x3(offset(src, -1, 0), mat3, mat4, mat5); // Row 1
    pixels += convolution1x3(offset(src, -1, 1), mat6, mat7, mat8); // Row 2

    // Divide by the scale. A zero scale is skipped (same guard as convolution5x5) so that a
    // caller passing 0 gets the unscaled sum rather than the result of an integer division by zero.
    if(scale > 0)
    {
        pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
    }

    return pixels;
}

#ifndef DYNAMIC_MATRIX_CONVOLUTION

/** Apply a 3x3 static convolution matrix to a single channel U8 input image and output a single channel image.
 *
 * @attention The matrix coefficients(MAT0, MAT1, ... MAT8, SCALE), DATA_TYPE, and DATA_TYPE_OUT need to be passed at compile time.\n
 * e.g. -DMAT0=1 -DMAT2=2, ...-DMAT8=8, -DSCALE=1, -DDATA_TYPE=int, -DDATA_TYPE_OUT=int
 *
 * @param[in]  src_ptr                           Pointer to the source image
 * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] dst_ptr                           Pointer to the destination image.
Supported data types: U8, S16
 * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes Offset of the first element in the destination image
 */
__kernel void convolution3x3_static(
    IMAGE_DECLARATION(src),
    IMAGE_DECLARATION(dst))
{
    Image src_img = CONVERT_TO_IMAGE_STRUCT(src);
    Image dst_img = CONVERT_TO_IMAGE_STRUCT(dst);

    // Eight convolved and scaled values, using the compile-time matrix MAT0..MAT8 and SCALE
    const VEC_DATA_TYPE(DATA_TYPE, 8) filtered = convolution3x3(&src_img,
                                                                MAT0, MAT1, MAT2,
                                                                MAT3, MAT4, MAT5,
                                                                MAT6, MAT7, MAT8,
                                                                SCALE);

    // Saturate to the output type and write the eight results
    vstore8(CONVERT_SAT(filtered, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst_img.ptr);
}

#endif // DYNAMIC_MATRIX_CONVOLUTION
diff --git a/src/core/CL/cl_kernels/convolution5x5.cl b/src/core/CL/cl_kernels/convolution5x5.cl deleted file mode 100644 index 9995ebfa90..0000000000 --- a/src/core/CL/cl_kernels/convolution5x5.cl +++ /dev/null @@ -1,287 +0,0 @@ -/* - * Copyright (c) 2016-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software.
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#ifndef DATA_TYPE -#define DATA_TYPE short -#endif /* DATA_TYPE */ - -#ifndef COMPUTE_TYPE -#define COMPUTE_TYPE int -#endif /* COMPUTE_TYPE */ - -#ifndef DATA_TYPE_OUT -#define DATA_TYPE_OUT uchar -#endif /* DATA_TYPE_OUT */ - -/** Compute a 1D horizontal convolution of size 5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels). - * - * @param[in] left_pixel Pointer to the left pixel - * @param[in] left1_coeff Weight of the most left pixel - * @param[in] left2_coeff Weight of the left pixel - * @param[in] middle_coeff Weight of the middle pixel - * @param[in] right1_coeff Weight of the right pixel - * @param[in] right2_coeff Weight of the most right pixel - * - * @return a short8 containing 8 convoluted values. 
*/
VEC_DATA_TYPE(DATA_TYPE, 8)
convolution1x5(
    __global const uchar *left_pixel,
    const short           left1_coeff,
    const short           left2_coeff,
    const short           middle_coeff,
    const short           right1_coeff,
    const short           right2_coeff)
{
    // A single 16-byte load provides the twelve bytes covered by the five shifted 8-wide windows
    const uchar16 row = vload16(0, left_pixel);

    // Accumulate the five taps from leftmost to rightmost
    VEC_DATA_TYPE(DATA_TYPE, 8)
    acc = CONVERT(row.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8)) * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff;
    acc += CONVERT(row.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8)) * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff;
    acc += CONVERT(row.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8)) * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff;
    acc += CONVERT(row.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8)) * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff;
    acc += CONVERT(row.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8)) * (VEC_DATA_TYPE(DATA_TYPE, 8))right2_coeff;

    return acc;
}

/** Compute a 1D vertical convolution of size 5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
 *
 * @param[in] src          Pointer to source image.
 * @param[in] up1_coeff    Weight of the most up pixel
 * @param[in] up2_coeff    Weight of the up pixel
 * @param[in] middle_coeff Weight of the middle pixel
 * @param[in] down1_coeff  Weight of the down pixel
 * @param[in] down2_coeff  Weight of the most down pixel
 *
 * @return a short8 containing 8 convoluted values.
*/
VEC_DATA_TYPE(COMPUTE_TYPE, 8)
convolution5x1(
    Image      *src,
    const short up1_coeff,
    const short up2_coeff,
    const short middle_coeff,
    const short down1_coeff,
    const short down2_coeff)
{
    // Weights ordered from the topmost row (y - 2) to the bottommost row (y + 2)
    const short weights[5] = { up1_coeff, up2_coeff, middle_coeff, down1_coeff, down2_coeff };

    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
    acc = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0;

    for(int row = 0; row < 5; ++row)
    {
        // Load eight DATA_TYPE elements from the row at vertical offset (row - 2) and widen them to COMPUTE_TYPE
        const VEC_DATA_TYPE(COMPUTE_TYPE, 8) samples = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, row - 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
        acc += samples * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))weights[row];
    }

    return acc;
}

/** Apply a 5x5 convolution matrix to a single channel U8 input image and return the result.
 *
 * Convolution matrix layout:\n
 * [ mat0, mat1, mat2, mat3 , mat4 ]\n
 * [ mat5, mat6, mat7, mat8, mat9 ]\n
 * [ mat10, mat11, mat12, mat13, mat14 ]\n
 * [ mat15, mat16, mat17, mat18, mat19 ]\n
 * [ mat20, mat21, mat22, mat23, mat24 ]
 *
 * @param[in] src   A pointer to source Image structure.
- * @param[in] mat0 Coefficient from the convolution matrix - * @param[in] mat1 Coefficient from the convolution matrix - * @param[in] mat2 Coefficient from the convolution matrix - * @param[in] mat3 Coefficient from the convolution matrix - * @param[in] mat4 Coefficient from the convolution matrix - * @param[in] mat5 Coefficient from the convolution matrix - * @param[in] mat6 Coefficient from the convolution matrix - * @param[in] mat7 Coefficient from the convolution matrix - * @param[in] mat8 Coefficient from the convolution matrix - * @param[in] mat9 Coefficient from the convolution matrix - * @param[in] mat10 Coefficient from the convolution matrix - * @param[in] mat11 Coefficient from the convolution matrix - * @param[in] mat12 Coefficient from the convolution matrix - * @param[in] mat13 Coefficient from the convolution matrix - * @param[in] mat14 Coefficient from the convolution matrix - * @param[in] mat15 Coefficient from the convolution matrix - * @param[in] mat16 Coefficient from the convolution matrix - * @param[in] mat17 Coefficient from the convolution matrix - * @param[in] mat18 Coefficient from the convolution matrix - * @param[in] mat19 Coefficient from the convolution matrix - * @param[in] mat20 Coefficient from the convolution matrix - * @param[in] mat21 Coefficient from the convolution matrix - * @param[in] mat22 Coefficient from the convolution matrix - * @param[in] mat23 Coefficient from the convolution matrix - * @param[in] mat24 Coefficient from the convolution matrix - * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0) - * - * @return a short8 containing 8 convoluted and scaled values. 
- */ -short8 convolution5x5( - Image *src, - const short mat0, const short mat1, const short mat2, const short mat3, const short mat4, - const short mat5, const short mat6, const short mat7, const short mat8, const short mat9, - const short mat10, const short mat11, const short mat12, const short mat13, const short mat14, - const short mat15, const short mat16, const short mat17, const short mat18, const short mat19, - const short mat20, const short mat21, const short mat22, const short mat23, const short mat24, - uint scale) -{ - VEC_DATA_TYPE(DATA_TYPE, 8) - pixels; - - pixels = convolution1x5(offset(src, -2, -2), mat0, mat1, mat2, mat3, mat4); - pixels += convolution1x5(offset(src, -2, -1), mat5, mat6, mat7, mat8, mat9); - pixels += convolution1x5(offset(src, -2, 0), mat10, mat11, mat12, mat13, mat14); - pixels += convolution1x5(offset(src, -2, 1), mat15, mat16, mat17, mat18, mat19); - pixels += convolution1x5(offset(src, -2, 2), mat20, mat21, mat22, mat23, mat24); - - if(scale > 0) - { - pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale; - } - - return convert_short8_sat(pixels); -} - -#ifndef DYNAMIC_MATRIX_CONVOLUTION - -/** Apply a 1x5 static convolution matrix to a single channel U8 input image and output a single temporary channel image(Support U16, S16, S32). - * - * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4) and DATA_TYPE need to be passed at compile time:\n - * e.g. -DMAT0=1 -DMAT2=2, -DMAT3=3, -DMAT4=4, -DDATA_TYPE=int - * - * @param[in] src_ptr Pointer to the source image. 
Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U16, S16, S32 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void convolution_separable1x5_static( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Output pixels - VEC_DATA_TYPE(DATA_TYPE, 8) - pixels = convolution1x5(offset(&src, -2, 0), MAT0, MAT1, MAT2, MAT3, MAT4); - - // Store result in dst - vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr); -} - -/** Apply a 5x1 static convolution matrix to a single channel U8 input image and output a single channel image. - * - * @attention The matrix coefficients (MAT5, MAT6, MAT7, MAT8, MAT9, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n - * e.g. -DMAT5=1 -DMAT6=2, -DMAT7=3, -DMAT8=4, -DMAT9=5, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int - * - * @param[in] src_ptr Pointer to the source image. 
Supported data types: U16, S16, S32 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void convolution_separable5x1_static( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Output pixels - VEC_DATA_TYPE(COMPUTE_TYPE, 8) - pixels = convolution5x1(&src, MAT5, MAT6, MAT7, MAT8, MAT9); - - // Divide by the scale - pixels /= (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE; - - // Store result in dst - vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr); -} - -/** Apply a static 5x5 convolution matrix to a single channel U8 input image and output a single channel image including borders - * - * @attention The matrix coefficients(MAT0, MAT1, ... MAT24, SCALE), DATA_TYPE_OUT need to be passed at compile time:\n - * e.g. -DMAT0=1 -DMAT1=2, ... -DMAT24=24, -DSCALE=6, -DDATA_TYPE_OUT=int - * - * @param[in] src_ptr Pointer to the source image. 
Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void convolution5x5_static( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - short8 pixels = convolution5x5(&src, - MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13, - MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, SCALE); - - // Store the result as is in dst - vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr); -} - -#endif // DYNAMIC_MATRIX_CONVOLUTION diff --git a/src/core/CL/cl_kernels/convolution7x7.cl b/src/core/CL/cl_kernels/convolution7x7.cl deleted file mode 100644 index 50fb3d7f35..0000000000 --- a/src/core/CL/cl_kernels/convolution7x7.cl +++ /dev/null @@ -1,338 +0,0 @@ -/* - * Copyright (c) 2016-2019 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#ifndef DATA_TYPE -#define DATA_TYPE short -#endif /* DATA_TYPE */ - -#ifndef COMPUTE_TYPE -#define COMPUTE_TYPE int -#endif /* COMPUTE_TYPE */ - -#ifndef DATA_TYPE_OUT -#define DATA_TYPE_OUT uchar -#endif /* DATA_TYPE_OUT */ - -/** Compute a 1D horizontal convolution of size 7 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels). 
- * - * @param[in] left_pixel Pointer to the left pixel - * @param[in] left1_coeff Weight of the most left pixel - * @param[in] left2_coeff Weight of the second left pixel - * @param[in] left3_coeff Weight of the left pixel - * @param[in] middle_coeff Weight of the middle pixel - * @param[in] right1_coeff Weight of the right pixel - * @param[in] right2_coeff Weight of the second right pixel - * @param[in] right3_coeff Weight of the most right pixel - * - * @return a short8 containing 8 convoluted values. - */ -VEC_DATA_TYPE(DATA_TYPE, 8) -convolution1x7( - __global const uchar *left_pixel, - const short left1_coeff, - const short left2_coeff, - const short left3_coeff, - const short middle_coeff, - const short right1_coeff, - const short right2_coeff, - const short right3_coeff) -{ - uchar16 temp = vload16(0, left_pixel); - - VEC_DATA_TYPE(DATA_TYPE, 8) - left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - left3 = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - middle = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - right1 = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - right2 = CONVERT(temp.s56789abc, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - right3 = CONVERT(temp.s6789abcd, VEC_DATA_TYPE(DATA_TYPE, 8)); - - return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff + left3 * (VEC_DATA_TYPE(DATA_TYPE, 8))left3_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE, - 8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE, 8))right2_coeff + right3 * (VEC_DATA_TYPE(DATA_TYPE, 8))right3_coeff; -} - -/** Compute a 1D vertical convolution of size 7 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 
pixels). - * - * @param[in] src Pointer to source image. - * @param[in] up1_coeff Weight of the most up pixel - * @param[in] up2_coeff Weight of the second up pixel - * @param[in] up3_coeff Weight of the up pixel - * @param[in] middle_coeff Weight of the middle pixel - * @param[in] down1_coeff Weight of the down pixel - * @param[in] down2_coeff Weight of the second down pixel - * @param[in] down3_coeff Weight of the third down pixel - * - * @return a short8 containing 8 convoluted values. - */ -VEC_DATA_TYPE(COMPUTE_TYPE, 8) -convolution7x1( - Image *src, - const short up1_coeff, - const short up2_coeff, - const short up3_coeff, - const short middle_coeff, - const short down1_coeff, - const short down2_coeff, - const short down3_coeff) -{ - VEC_DATA_TYPE(COMPUTE_TYPE, 8) - val; - VEC_DATA_TYPE(COMPUTE_TYPE, 8) - out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up3_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 
8))down3_coeff; - - return out; -} - -/** Apply a 7x7 convolution matrix to a single channel U8 input image and return the result. - * - * Convolution matrix layout:\n - * [ mat0, mat1, mat2, mat3 , mat4, mat5, mat6 ]\n - * [ mat7, mat8, mat9, mat10, mat11, mat12, mat13 ]\n - * [ mat14, mat15, mat16, mat17, mat18, mat19, mat20 ]\n - * [ mat21, mat22, mat23, mat24, mat25, mat26, mat27 ]\n - * [ mat28, mat29, mat30, mat31, mat32, mat33, mat34 ]\n - * [ mat35, mat36, mat37, mat38, mat39, mat40, mat41 ]\n - * [ mat42, mat43, mat44, mat45, mat46, mat47, mat48 ] - * - * @param[in] src A pointer to source Image structure. - * @param[in] mat0 Coefficient from the convolution matrix - * @param[in] mat1 Coefficient from the convolution matrix - * @param[in] mat2 Coefficient from the convolution matrix - * @param[in] mat3 Coefficient from the convolution matrix - * @param[in] mat4 Coefficient from the convolution matrix - * @param[in] mat5 Coefficient from the convolution matrix - * @param[in] mat6 Coefficient from the convolution matrix - * @param[in] mat7 Coefficient from the convolution matrix - * @param[in] mat8 Coefficient from the convolution matrix - * @param[in] mat9 Coefficient from the convolution matrix - * @param[in] mat10 Coefficient from the convolution matrix - * @param[in] mat11 Coefficient from the convolution matrix - * @param[in] mat12 Coefficient from the convolution matrix - * @param[in] mat13 Coefficient from the convolution matrix - * @param[in] mat14 Coefficient from the convolution matrix - * @param[in] mat15 Coefficient from the convolution matrix - * @param[in] mat16 Coefficient from the convolution matrix - * @param[in] mat17 Coefficient from the convolution matrix - * @param[in] mat18 Coefficient from the convolution matrix - * @param[in] mat19 Coefficient from the convolution matrix - * @param[in] mat20 Coefficient from the convolution matrix - * @param[in] mat21 Coefficient from the convolution matrix - * @param[in] mat22 Coefficient from the 
convolution matrix - * @param[in] mat23 Coefficient from the convolution matrix - * @param[in] mat24 Coefficient from the convolution matrix - * @param[in] mat25 Coefficient from the convolution matrix - * @param[in] mat26 Coefficient from the convolution matrix - * @param[in] mat27 Coefficient from the convolution matrix - * @param[in] mat28 Coefficient from the convolution matrix - * @param[in] mat29 Coefficient from the convolution matrix - * @param[in] mat30 Coefficient from the convolution matrix - * @param[in] mat31 Coefficient from the convolution matrix - * @param[in] mat32 Coefficient from the convolution matrix - * @param[in] mat33 Coefficient from the convolution matrix - * @param[in] mat34 Coefficient from the convolution matrix - * @param[in] mat35 Coefficient from the convolution matrix - * @param[in] mat36 Coefficient from the convolution matrix - * @param[in] mat37 Coefficient from the convolution matrix - * @param[in] mat38 Coefficient from the convolution matrix - * @param[in] mat39 Coefficient from the convolution matrix - * @param[in] mat40 Coefficient from the convolution matrix - * @param[in] mat41 Coefficient from the convolution matrix - * @param[in] mat42 Coefficient from the convolution matrix - * @param[in] mat43 Coefficient from the convolution matrix - * @param[in] mat44 Coefficient from the convolution matrix - * @param[in] mat45 Coefficient from the convolution matrix - * @param[in] mat46 Coefficient from the convolution matrix - * @param[in] mat47 Coefficient from the convolution matrix - * @param[in] mat48 Coefficient from the convolution matrix - * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0) - * - */ -short8 convolution7x7( - Image *src, - const short mat0, const short mat1, const short mat2, const short mat3, const short mat4, - const short mat5, const short mat6, const short mat7, const short mat8, const short mat9, - const short mat10, const short mat11, const short mat12, const 
short mat13, const short mat14, - const short mat15, const short mat16, const short mat17, const short mat18, const short mat19, - const short mat20, const short mat21, const short mat22, const short mat23, const short mat24, - const short mat25, const short mat26, const short mat27, const short mat28, const short mat29, - const short mat30, const short mat31, const short mat32, const short mat33, const short mat34, - const short mat35, const short mat36, const short mat37, const short mat38, const short mat39, - const short mat40, const short mat41, const short mat42, const short mat43, const short mat44, - const short mat45, const short mat46, const short mat47, const short mat48, uint scale) -{ - VEC_DATA_TYPE(DATA_TYPE, 8) - pixels; - - pixels = convolution1x7(offset(src, -3, -3), mat0, mat1, mat2, mat3, mat4, mat5, mat6); - pixels += convolution1x7(offset(src, -3, -2), mat7, mat8, mat9, mat10, mat11, mat12, mat13); - pixels += convolution1x7(offset(src, -3, -1), mat14, mat15, mat16, mat17, mat18, mat19, mat20); - pixels += convolution1x7(offset(src, -3, 0), mat21, mat22, mat23, mat24, mat25, mat26, mat27); - pixels += convolution1x7(offset(src, -3, 1), mat28, mat29, mat30, mat31, mat32, mat33, mat34); - pixels += convolution1x7(offset(src, -3, 2), mat35, mat36, mat37, mat38, mat39, mat40, mat41); - pixels += convolution1x7(offset(src, -3, 3), mat42, mat43, mat44, mat45, mat46, mat47, mat48); - - if(scale > 0) - { - pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale; - } - - return convert_short8_sat(pixels); -} - -#ifndef DYNAMIC_MATRIX_CONVOLUTION - -/** Apply a 1x7 static convolution matrix to a single channel U8 input image and output a single temporary channel image. - * - * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6) and DATA_TYPE need to be passed at compile time:\n - * e.g. -DMAT0=1 -DMAT1=2, ... -DMAT6=6, -DDATA_TYPE=int - * - * @param[in] src_ptr Pointer to the source image. 
Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U16, S16, S32 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void convolution_separable1x7_static( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Output pixels - VEC_DATA_TYPE(DATA_TYPE, 8) - pixels = convolution1x7(offset(&src, -3, 0), MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6); - - // Store result in dst - vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr); -} - -/** Apply a 7x1 static convolution matrix to a single channel U8 input image and output a single channel image. - * - * @attention The matrix coefficients (MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n - * e.g. -DMAT0=7 -DMAT1=8, ... -DMAT24=13, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int - * - * @param[in] src_ptr Pointer to the source image. 
Supported data types: U16, S16, S32 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void convolution_separable7x1_static( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Output pixels - VEC_DATA_TYPE(COMPUTE_TYPE, 8) - pixels = convolution7x1(&src, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13); - - // Divide by the scale - pixels /= (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE; - - // Store result in dst - vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr); -} - -/** Apply a static 7x7 convolution matrix to a single channel U8 input image and output a single channel U8 image including the borders. - * - * @attention The matrix coefficients(MAT0, MAT1, ... MAT48, SCALE), DATA_TYPE_OUT need to be passed at compile time:\n - * e.g. -DMAT0=7 -DMAT1=8, ... 
-DMAT48=48, -DSCALE=6, -DDATA_TYPE_OUT=int - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void convolution7x7_static( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - short8 pixels = convolution7x7(&src, - MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13, - MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25, - MAT26, MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35, MAT36, MAT37, - MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44, MAT45, MAT46, MAT47, MAT48, SCALE); - - // Clamp results to [ 0, 255 ] and store them in dst - vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr); -} - -#endif // DYNAMIC_MATRIX_CONVOLUTION diff --git a/src/core/CL/cl_kernels/convolution9x9.cl 
b/src/core/CL/cl_kernels/convolution9x9.cl deleted file mode 100644 index 7e77c61fea..0000000000 --- a/src/core/CL/cl_kernels/convolution9x9.cl +++ /dev/null @@ -1,403 +0,0 @@ -/* - * Copyright (c) 2016-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#ifndef DATA_TYPE -#define DATA_TYPE short -#endif /* DATA_TYPE */ - -#ifndef COMPUTE_TYPE -#define COMPUTE_TYPE int -#endif /* COMPUTE_TYPE */ - -#ifndef DATA_TYPE_OUT -#define DATA_TYPE_OUT uchar -#endif /* DATA_TYPE_OUT */ - -/** Compute a 1D horizontal convolution of size 9 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels). 
- * - * @param[in] left_pixel Pointer to the left pixel - * @param[in] left1_coeff Weight of the most left pixel - * @param[in] left2_coeff Weight of the second left pixel - * @param[in] left3_coeff Weight of the third left pixel - * @param[in] left4_coeff Weight of the left pixel - * @param[in] middle_coeff Weight of the middle pixel - * @param[in] right1_coeff Weight of the right pixel - * @param[in] right2_coeff Weight of the second right pixel - * @param[in] right3_coeff Weight of the third right pixel - * @param[in] right4_coeff Weight of the most right pixel - * - * @return a short8 containing 8 convoluted values. - */ -VEC_DATA_TYPE(DATA_TYPE, 8) -convolution1x9( - __global const uchar *left_pixel, - const short left1_coeff, - const short left2_coeff, - const short left3_coeff, - const short left4_coeff, - const short middle_coeff, - const short right1_coeff, - const short right2_coeff, - const short right3_coeff, - const short right4_coeff) -{ - uchar16 temp = vload16(0, left_pixel); - - VEC_DATA_TYPE(DATA_TYPE, 8) - left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - left3 = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - left4 = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - middle = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - right1 = CONVERT(temp.s56789abc, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - right2 = CONVERT(temp.s6789abcd, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - right3 = CONVERT(temp.s789abcde, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - right4 = CONVERT(temp.s89abcdef, VEC_DATA_TYPE(DATA_TYPE, 8)); - - return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff + left3 * 
(VEC_DATA_TYPE(DATA_TYPE, 8))left3_coeff + left4 * (VEC_DATA_TYPE(DATA_TYPE, - 8))left4_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE, - 8))right2_coeff + right3 * (VEC_DATA_TYPE(DATA_TYPE, 8))right3_coeff + right4 * (VEC_DATA_TYPE(DATA_TYPE, 8))right4_coeff; -} - -/** Compute a 1D vertical convolution of size 9 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels). - * - * @param[in] src Pointer to source image. - * @param[in] up1_coeff Weight of the most up pixel - * @param[in] up2_coeff Weight of the second up pixel - * @param[in] up3_coeff Weight of the third up pixel - * @param[in] up4_coeff Weight of the up pixel - * @param[in] middle_coeff Weight of the middle pixel - * @param[in] down1_coeff Weight of the down pixel - * @param[in] down2_coeff Weight of the second down pixel - * @param[in] down3_coeff Weight of the third down pixel - * @param[in] down4_coeff Weight of the most down pixel - * - * @return a short8 containing 8 convoluted values. 
- */ -VEC_DATA_TYPE(COMPUTE_TYPE, 8) -convolution9x1( - Image *src, - const short up1_coeff, - const short up2_coeff, - const short up3_coeff, - const short up4_coeff, - const short middle_coeff, - const short down1_coeff, - const short down2_coeff, - const short down3_coeff, - const short down4_coeff) -{ - VEC_DATA_TYPE(COMPUTE_TYPE, 8) - val; - VEC_DATA_TYPE(COMPUTE_TYPE, 8) - out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -4)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up3_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up4_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down3_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 4)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down4_coeff; - - return out; -} - -/** Apply a 9x9 convolution matrix to a single channel U8 input image and return the 
result. - * - * Convolution matrix layout:\n - * [ mat0, mat1, mat2, mat3, mat4, mat5, mat6, mat7, mat8 ]\n - * [ mat9, mat10, mat11, mat12, mat13, mat14, mat15, mat16, mat17 ]\n - * [ mat18, mat19, mat20, mat21, mat22, mat23, mat24, mat25, mat26 ]\n - * [ mat27, mat28, mat29, mat30, mat31, mat32, mat33, mat34, mat35 ]\n - * [ mat36, mat37, mat38, mat39, mat40, mat41, mat42, mat43, mat44 ]\n - * [ mat45, mat46, mat47, mat48, mat49, mat50, mat51, mat52, mat53 ]\n - * [ mat54, mat55, mat56, mat57, mat58, mat59, mat60, mat61, mat62 ]\n - * [ mat63, mat64, mat65, mat66, mat67, mat68, mat69, mat70, mat71 ]\n - * [ mat72, mat73, mat74, mat75, mat76, mat77, mat78, mat79, mat80 ] - * - * @param[in] src A pointer to source Image structure. - * @param[in] mat0 Coefficient from the convolution matrix - * @param[in] mat1 Coefficient from the convolution matrix - * @param[in] mat2 Coefficient from the convolution matrix - * @param[in] mat3 Coefficient from the convolution matrix - * @param[in] mat4 Coefficient from the convolution matrix - * @param[in] mat5 Coefficient from the convolution matrix - * @param[in] mat6 Coefficient from the convolution matrix - * @param[in] mat7 Coefficient from the convolution matrix - * @param[in] mat8 Coefficient from the convolution matrix - * @param[in] mat9 Coefficient from the convolution matrix - * @param[in] mat10 Coefficient from the convolution matrix - * @param[in] mat11 Coefficient from the convolution matrix - * @param[in] mat12 Coefficient from the convolution matrix - * @param[in] mat13 Coefficient from the convolution matrix - * @param[in] mat14 Coefficient from the convolution matrix - * @param[in] mat15 Coefficient from the convolution matrix - * @param[in] mat16 Coefficient from the convolution matrix - * @param[in] mat17 Coefficient from the convolution matrix - * @param[in] mat18 Coefficient from the convolution matrix - * @param[in] mat19 Coefficient from the convolution matrix - * @param[in] mat20 Coefficient from the 
convolution matrix - * @param[in] mat21 Coefficient from the convolution matrix - * @param[in] mat22 Coefficient from the convolution matrix - * @param[in] mat23 Coefficient from the convolution matrix - * @param[in] mat24 Coefficient from the convolution matrix - * @param[in] mat25 Coefficient from the convolution matrix - * @param[in] mat26 Coefficient from the convolution matrix - * @param[in] mat27 Coefficient from the convolution matrix - * @param[in] mat28 Coefficient from the convolution matrix - * @param[in] mat29 Coefficient from the convolution matrix - * @param[in] mat30 Coefficient from the convolution matrix - * @param[in] mat31 Coefficient from the convolution matrix - * @param[in] mat32 Coefficient from the convolution matrix - * @param[in] mat33 Coefficient from the convolution matrix - * @param[in] mat34 Coefficient from the convolution matrix - * @param[in] mat35 Coefficient from the convolution matrix - * @param[in] mat36 Coefficient from the convolution matrix - * @param[in] mat37 Coefficient from the convolution matrix - * @param[in] mat38 Coefficient from the convolution matrix - * @param[in] mat39 Coefficient from the convolution matrix - * @param[in] mat40 Coefficient from the convolution matrix - * @param[in] mat41 Coefficient from the convolution matrix - * @param[in] mat42 Coefficient from the convolution matrix - * @param[in] mat43 Coefficient from the convolution matrix - * @param[in] mat44 Coefficient from the convolution matrix - * @param[in] mat45 Coefficient from the convolution matrix - * @param[in] mat46 Coefficient from the convolution matrix - * @param[in] mat47 Coefficient from the convolution matrix - * @param[in] mat48 Coefficient from the convolution matrix - * @param[in] mat49 Coefficient from the convolution matrix - * @param[in] mat50 Coefficient from the convolution matrix - * @param[in] mat51 Coefficient from the convolution matrix - * @param[in] mat52 Coefficient from the convolution matrix - * @param[in] mat53 
Coefficient from the convolution matrix - * @param[in] mat54 Coefficient from the convolution matrix - * @param[in] mat55 Coefficient from the convolution matrix - * @param[in] mat56 Coefficient from the convolution matrix - * @param[in] mat57 Coefficient from the convolution matrix - * @param[in] mat58 Coefficient from the convolution matrix - * @param[in] mat59 Coefficient from the convolution matrix - * @param[in] mat60 Coefficient from the convolution matrix - * @param[in] mat61 Coefficient from the convolution matrix - * @param[in] mat62 Coefficient from the convolution matrix - * @param[in] mat63 Coefficient from the convolution matrix - * @param[in] mat64 Coefficient from the convolution matrix - * @param[in] mat65 Coefficient from the convolution matrix - * @param[in] mat66 Coefficient from the convolution matrix - * @param[in] mat67 Coefficient from the convolution matrix - * @param[in] mat68 Coefficient from the convolution matrix - * @param[in] mat69 Coefficient from the convolution matrix - * @param[in] mat70 Coefficient from the convolution matrix - * @param[in] mat71 Coefficient from the convolution matrix - * @param[in] mat72 Coefficient from the convolution matrix - * @param[in] mat73 Coefficient from the convolution matrix - * @param[in] mat74 Coefficient from the convolution matrix - * @param[in] mat75 Coefficient from the convolution matrix - * @param[in] mat76 Coefficient from the convolution matrix - * @param[in] mat77 Coefficient from the convolution matrix - * @param[in] mat78 Coefficient from the convolution matrix - * @param[in] mat79 Coefficient from the convolution matrix - * @param[in] mat80 Coefficient from the convolution matrix - * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0) - * - */ -short8 convolution9x9( - Image *src, - const short mat0, const short mat1, const short mat2, const short mat3, const short mat4, - const short mat5, const short mat6, const short mat7, const short mat8, const 
short mat9, - const short mat10, const short mat11, const short mat12, const short mat13, const short mat14, - const short mat15, const short mat16, const short mat17, const short mat18, const short mat19, - const short mat20, const short mat21, const short mat22, const short mat23, const short mat24, - const short mat25, const short mat26, const short mat27, const short mat28, const short mat29, - const short mat30, const short mat31, const short mat32, const short mat33, const short mat34, - const short mat35, const short mat36, const short mat37, const short mat38, const short mat39, - const short mat40, const short mat41, const short mat42, const short mat43, const short mat44, - const short mat45, const short mat46, const short mat47, const short mat48, const short mat49, - const short mat50, const short mat51, const short mat52, const short mat53, const short mat54, - const short mat55, const short mat56, const short mat57, const short mat58, const short mat59, - const short mat60, const short mat61, const short mat62, const short mat63, const short mat64, - const short mat65, const short mat66, const short mat67, const short mat68, const short mat69, - const short mat70, const short mat71, const short mat72, const short mat73, const short mat74, - const short mat75, const short mat76, const short mat77, const short mat78, const short mat79, - const short mat80, uint scale) -{ - VEC_DATA_TYPE(DATA_TYPE, 8) - pixels; - - pixels = convolution1x9(offset(src, -4, -4), mat0, mat1, mat2, mat3, mat4, mat5, mat6, mat7, mat8); - pixels += convolution1x9(offset(src, -4, -3), mat9, mat10, mat11, mat12, mat13, mat14, mat15, mat16, mat17); - pixels += convolution1x9(offset(src, -4, -2), mat18, mat19, mat20, mat21, mat22, mat23, mat24, mat25, mat26); - pixels += convolution1x9(offset(src, -4, -1), mat27, mat28, mat29, mat30, mat31, mat32, mat33, mat34, mat35); - pixels += convolution1x9(offset(src, -4, 0), mat36, mat37, mat38, mat39, mat40, mat41, mat42, mat43, mat44); - 
pixels += convolution1x9(offset(src, -4, 1), mat45, mat46, mat47, mat48, mat49, mat50, mat51, mat52, mat53); - pixels += convolution1x9(offset(src, -4, 2), mat54, mat55, mat56, mat57, mat58, mat59, mat60, mat61, mat62); - pixels += convolution1x9(offset(src, -4, 3), mat63, mat64, mat65, mat66, mat67, mat68, mat69, mat70, mat71); - pixels += convolution1x9(offset(src, -4, 4), mat72, mat73, mat74, mat75, mat76, mat77, mat78, mat79, mat80); - - if(scale > 0) - { - pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale; - } - - return convert_short8_sat(pixels); -} - -#ifndef DYNAMIC_MATRIX_CONVOLUTION - -/** Apply a 1x9 static convolution matrix to a single channel U8 input image and output a single temporary channel image. - * - * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8) and DATA_TYPE need to be passed at compile time:\n - * e.g. -DMAT0=7 -DMAT1=8, ... -DMAT8=8, -DDATA_TYPE=int - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. 
Supported data types: U16, S16, S32 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void convolution_separable1x9_static( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Output pixels - VEC_DATA_TYPE(DATA_TYPE, 8) - pixels = convolution1x9(offset(&src, -4, 0), MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8); - - // Store result in dst - vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr); -} - -/** Apply a 9x1 static convolution matrix to a single channel U8 input image and output a single channel image. - * - * @attention The matrix coefficients (MAT9, MAT10, ... MAT17, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n - * e.g. -DMAT9=9 -DMAT10=10, ... -DMAT17=17, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U16, S16, S32 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. 
Supported data types: U8, S16 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void convolution_separable9x1_static( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Output pixels - VEC_DATA_TYPE(COMPUTE_TYPE, 8) - pixels = convolution9x1(&src, MAT9, MAT10, MAT11, MAT12, MAT13, MAT14, MAT15, MAT16, MAT17); - - // Divide by the scale - pixels = pixels / (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE; - - // Store result in dst - vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr); -} - -/** Apply a static 9x9 convolution matrix to a single channel U8 input image and output a single channel image including borders - * - * @attention The matrix coefficients(MAT0, MAT1, ... MAT80, SCALE), DATA_TYPE_OUT need to be passed at compile time:\n - * e.g. -DMAT0=0 -DMAT1=1, ... -DMAT80=80, -DSCALE=6, -DDATA_TYPE_OUT=int - * - * @param[in] src_ptr Pointer to the source image. 
Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void convolution9x9_static( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - short8 pixels = convolution9x9(&src, - MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13, - MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25, - MAT26, MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35, MAT36, MAT37, - MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44, MAT45, MAT46, MAT47, MAT48, MAT49, - MAT50, MAT51, MAT52, MAT53, MAT54, MAT55, MAT56, MAT57, MAT58, MAT59, MAT60, MAT61, - MAT62, MAT63, MAT64, MAT65, MAT66, MAT67, MAT68, MAT69, MAT70, MAT71, MAT72, MAT73, - MAT74, MAT75, MAT76, MAT77, MAT78, MAT79, MAT80, SCALE); - - // Store the result as is in dst - vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT 
*)dst.ptr); -} - -#endif // DYNAMIC_MATRIX_CONVOLUTION diff --git a/src/core/CL/cl_kernels/convolution_rectangle.cl b/src/core/CL/cl_kernels/convolution_rectangle.cl deleted file mode 100644 index 925a698628..0000000000 --- a/src/core/CL/cl_kernels/convolution_rectangle.cl +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) 2016, 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "convolution3x3.cl" -#include "convolution5x5.cl" -#include "convolution7x7.cl" -#include "convolution9x9.cl" -#include "helpers.h" - -#define MAT_INDEX(i) MAT##i - -#ifndef DATA_TYPE -#define DATA_TYPE short -#endif /* DATA_TYPE */ - -#ifndef COMPUTE_TYPE -#define COMPUTE_TYPE int -#endif /* COMPUTE_TYPE */ - -#ifndef DATA_TYPE_OUT -#define DATA_TYPE_OUT uchar -#endif /* DATA_TYPE_OUT */ - -#ifndef DYNAMIC_MATRIX_CONVOLUTION - -/** Apply a rectangle matrix to a single channel U8 input image and output a single channel image including borders - * - * @attention The matrix coefficients(MAT0, MAT1, ... MAT80, SCALE), MATRIX_WIDTH, MATRIX_HEIGHT, COMPUTE_TYPE, DATA_TYPE, DATA_TYPE_OUT need to be passed at compile time:\n - * e.g. -DMAT0=0 -DMAT1=1, ... -DMAT80=80, -DSCALE=6, -DMATRIX_WIDTH=3, -DMATRIX_HEIGHT=5, -DCOMPUTE_TYPE=int, -DDATA_TYPE=int, -DDATA_TYPE_OUT=int - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. 
Supported data types: U8, S16 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void convolution_rectangle( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - short matrix_coeff[81] = - { - MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, - MAT9, MAT10, MAT11, MAT12, MAT13, MAT14, MAT15, MAT16, MAT17, - MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25, MAT26, - MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35, - MAT36, MAT37, MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44, - MAT45, MAT46, MAT47, MAT48, MAT49, MAT50, MAT51, MAT52, MAT53, - MAT54, MAT55, MAT56, MAT57, MAT58, MAT59, MAT60, MAT61, MAT62, - MAT63, MAT64, MAT65, MAT66, MAT67, MAT68, MAT69, MAT70, MAT71, - MAT72, MAT73, MAT74, MAT75, MAT76, MAT77, MAT78, MAT79, MAT80 - }; - - VEC_DATA_TYPE(DATA_TYPE, 8) - pixels = (VEC_DATA_TYPE(DATA_TYPE, 8))0; - - for(int i = 0; i < MATRIX_HEIGHT; i++) - { -#if MATRIX_WIDTH == 3 - pixels += convolution1x3(offset(&src, -1, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 3], matrix_coeff[1 + i * 3], - matrix_coeff[2 + i * 3]); -#endif /* MATRIX_WIDTH */ - -#if MATRIX_WIDTH == 5 - pixels += convolution1x5(offset(&src, -2, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 5], matrix_coeff[1 + i * 5], - matrix_coeff[2 + i * 5], matrix_coeff[3 + i * 5], matrix_coeff[4 + i * 5]); -#endif /* MATRIX_WIDTH */ - -#if MATRIX_WIDTH == 7 - pixels += convolution1x7(offset(&src, -3, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 
7], matrix_coeff[1 + i * 7], - matrix_coeff[2 + i * 7], matrix_coeff[3 + i * 7], matrix_coeff[4 + i * 7], - matrix_coeff[5 + i * 7], matrix_coeff[6 + i * 7]); -#endif /* MATRIX_WIDTH */ - -#if MATRIX_WIDTH == 9 - pixels += convolution1x9(offset(&src, -4, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 9], matrix_coeff[1 + i * 9], - matrix_coeff[2 + i * 9], matrix_coeff[3 + i * 9], matrix_coeff[4 + i * 9], - matrix_coeff[5 + i * 9], matrix_coeff[6 + i * 9], matrix_coeff[7 + i * 9], matrix_coeff[8 + i * 9]); -#endif /* MATRIX_WIDTH */ - } - - pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))SCALE; - - // Store the result as is in dst - vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, ((__global DATA_TYPE_OUT *)dst.ptr)); -} - -#endif /* not DYNAMIC_MATRIX_CONVOLUTION */ diff --git a/src/core/CL/cl_kernels/derivative.cl b/src/core/CL/cl_kernels/derivative.cl deleted file mode 100644 index dddbb4d615..0000000000 --- a/src/core/CL/cl_kernels/derivative.cl +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2016, 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** This OpenCL kernel computes the first-order derivative. - * - * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient - * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S16 - * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[out] dst_gy_ptr Pointer to the destination image. 
Supported data types: S16 - * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void derivative( - IMAGE_DECLARATION(src) -#ifdef GRAD_X - , - IMAGE_DECLARATION(dst_gx) -#endif /* GRAD_X */ -#ifdef GRAD_Y - , - IMAGE_DECLARATION(dst_gy) -#endif /* GRAD_Y */ -) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); -#ifdef GRAD_X - Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx); -#endif /* GRAD_X */ -#ifdef GRAD_Y - Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy); -#endif /* GRAD_Y */ - -#ifdef GRAD_X - short16 l_data = convert_short16(vload16(0, offset(&src, -1, 0))); - short16 r_data = convert_short16(vload16(0, offset(&src, 1, 0))); - vstore16(r_data - l_data, 0, ((__global short *)dst_gx.ptr)); -#endif /* GRAD_X */ -#ifdef GRAD_Y - short16 t_data = convert_short16(vload16(0, offset(&src, 0, -1))); - short16 b_data = convert_short16(vload16(0, offset(&src, 0, 1))); - vstore16(b_data - t_data, 0, ((__global short *)dst_gy.ptr)); -#endif /* GRAD_Y */ -} diff --git a/src/core/CL/cl_kernels/dilate.cl b/src/core/CL/cl_kernels/dilate.cl deleted file mode 100644 index 14362c1f31..0000000000 --- a/src/core/CL/cl_kernels/dilate.cl +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2016, 2017 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** This function dilates an input image. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. 
Supported data types: U8 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void dilate( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - uchar16 top = vload16(0, offset(&src, -1, -1)); - uchar16 middle = vload16(0, offset(&src, -1, 0)); - uchar16 bottom = vload16(0, offset(&src, -1, 1)); - - uchar16 tmp = max(top, max(middle, bottom)); - uchar8 out = max(tmp.s01234567, max(tmp.s12345678, tmp.s23456789)); - - vstore8(out, 0, dst.ptr); -} diff --git a/src/core/CL/cl_kernels/erode.cl b/src/core/CL/cl_kernels/erode.cl deleted file mode 100644 index 810c5fc51a..0000000000 --- a/src/core/CL/cl_kernels/erode.cl +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2016, 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** This function erodes an input image. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. 
Supported data types: U8 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void erode( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - uchar16 top = vload16(0, offset(&src, -1, -1)); - uchar16 middle = vload16(0, offset(&src, -1, 0)); - uchar16 bottom = vload16(0, offset(&src, -1, 1)); - - uchar16 tmp = min(top, min(middle, bottom)); - uchar8 out = min(tmp.s01234567, min(tmp.s12345678, tmp.s23456789)); - - vstore8(out, 0, dst.ptr); -} diff --git a/src/core/CL/cl_kernels/fast_corners.cl b/src/core/CL/cl_kernels/fast_corners.cl deleted file mode 100644 index 89c144ab5e..0000000000 --- a/src/core/CL/cl_kernels/fast_corners.cl +++ /dev/null @@ -1,262 +0,0 @@ -/* - * Copyright (c) 2016-2018 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" -#include "types.h" - -/* The map table to retrieve the 16 texels in the Bresenham circle of radius 3 with center in P. - * - * . . F 0 1 . . . - * . E . . . 2 . . - * D . . . . . 3 . - * C . . P . . 4 . - * B . . . . . 5 . - * . A . . . 6 . . - * . . 9 8 7 . . . - */ -constant int offsets_s[16][2] = -{ - { 0, -3 }, // 0 - { 1, -3 }, // 1 - { 2, -2 }, // 2 - { 3, -1 }, // 3 - { 3, 0 }, // 4 - { 3, 1 }, // 5 - { 2, 2 }, // 6 - { 1, 3 }, // 7 - { 0, 3 }, // 8 - { -1, 3 }, // 9 - { -2, 2 }, // A - { -3, 1 }, // B - { -3, 0 }, // C - { -3, -1 }, // D - { -2, -2 }, // E - { -1, -3 }, // F -}; - -/** Load a pixel and set the mask values. - * - * @param[in] ptr The pointer to the starting address of source image - * @param[in] a Index to indicate the position in the Bresenham circle - * @param[in] stride Stride of source image in x dimension - * @param[in] dark The left end of the threshold range - * @param[in] bright The right end of the threshold range - * @param[out] dark_mask The bit-set mask records dark pixels. Its bit is set as 1 if the corresponding pixel is dark - * @param[out] bright_mask The bit-set mask records bright pixels. 
Its bit is set as 1 if the corresponding pixel is bright
 *
 * @note The macro expands to a brace-enclosed block (it is invoked both with and without a trailing
 *       semicolon) and declares a local named `pixel`. Every argument is parenthesised in the
 *       expansion so that expression arguments keep their intended grouping.
 */
#define LOAD_AND_SET_MASK(ptr, a, stride, dark, bright, dark_mask, bright_mask)   \
    {                                                                             \
        unsigned char pixel;                                                      \
        pixel = *((ptr) + (int)(stride) * offsets_s[(a)][1] + offsets_s[(a)][0]); \
        (dark_mask) |= (pixel < (dark)) << (a);                                   \
        (bright_mask) |= (pixel > (bright)) << (a);                               \
    }

/** Checks if a pixel is a corner. A pixel is considered a corner if 9 contiguous pixels in the Bresenham circle are all bright or all dark.
 *
 * @note Both masks are expected to hold the 16 circle bits duplicated in their upper 16 bits, so that
 *       runs wrapping around the circle are found. The masks are consumed (right-shifted) by the check.
 *
 * @param[in]  bright_mask The mask recording positions of bright pixels
 * @param[in]  dark_mask   The mask recording positions of dark pixels
 * @param[out] isCorner    Indicate whether candidate pixel is corner
 */
#define CHECK_CORNER(bright_mask, dark_mask, isCorner)                \
    {                                                                 \
        for(int i = 0; i < 16; i++)                                   \
        {                                                             \
            (isCorner) |= (((bright_mask) & 0x1FF) == 0x1FF);         \
            (isCorner) |= (((dark_mask) & 0x1FF) == 0x1FF);           \
            if(isCorner)                                              \
            {                                                         \
                break;                                                \
            }                                                         \
            (bright_mask) >>= 1;                                      \
            (dark_mask) >>= 1;                                        \
        }                                                             \
    }

/** Calculate a pixel's corner strength.
 *
 * Bisects the interval between @p threshold and 255: at every step the midpoint is used as the
 * brightness margin, the 16 circle pixels are re-classified and the corner test is repeated. The
 * lower bound moves up while the pixel is still a corner, the upper bound moves down otherwise.
 *
 * @param[in] candidate_pixel Value of the candidate (centre) pixel
 * @param[in] ptr             Pointer to the candidate pixel in the source image
 * @param[in] stride          Row stride of the source image (in bytes)
 * @param[in] threshold       Threshold the pixel has already been accepted with (initial lower bound)
 *
 * @return The largest tested margin for which the pixel is still classified as a corner
 */
uchar compute_strength(uchar candidate_pixel, __global unsigned char *ptr, unsigned int stride, unsigned char threshold)
{
    short a = threshold;
    short b = 255;
    while(b - a > 1)
    {
        uchar        c           = convert_uchar_sat((a + b) / 2);
        unsigned int bright_mask = 0;
        unsigned int dark_mask   = 0;

        unsigned char p_bright = add_sat(candidate_pixel, c);
        unsigned char p_dark   = sub_sat(candidate_pixel, c);

        bool isCorner = 0;

        for(uint i = 0; i < 16; i++)
        {
            LOAD_AND_SET_MASK(ptr, i, stride, p_dark, p_bright, dark_mask, bright_mask)
        }

        // Duplicate the 16 circle bits so that runs wrapping past position F are detected
        bright_mask |= (bright_mask << 16);
        dark_mask |= (dark_mask << 16);
        CHECK_CORNER(bright_mask, dark_mask, isCorner);

        if(isCorner)
        {
            a = convert_short(c);
        }
        else
        {
            b = convert_short(c);
        }
    }
    return a;
}

/** Fast corners implementation. Calculates and returns the strength of each pixel.
- * - * The algorithm loops through the 16 pixels in the Bresenham circle and set low 16 bit of masks if corresponding pixel is bright - * or dark. It then copy the low 16 bit to the high 16 bit of the masks. Right shift the bit to check whether the 9 continuous bits - * from the LSB are set. - * - * @param[in] input_ptr Pointer to the first source image. Supported data types: U8 - * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image - * @param[out] output_ptr Pointer to the first source image. Supported data types: U8 - * @param[in] output_stride_x Stride of the first source image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the first source image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the first source image - * @param[in] threshold_value Threshold value. 
- * - */ -__kernel void fast_corners( - IMAGE_DECLARATION(input), - IMAGE_DECLARATION(output), - float threshold_value) -{ - Image in = CONVERT_TO_IMAGE_STRUCT(input); - Image out = CONVERT_TO_IMAGE_STRUCT(output); - - const unsigned char threshold = (uchar)threshold_value; - - unsigned int bright_mask = 0; - unsigned int dark_mask = 0; - - unsigned char isCorner = 0; - - unsigned char p = *in.ptr; - unsigned char p_bright = add_sat(p, threshold); - unsigned char p_dark = sub_sat(p, threshold); - - LOAD_AND_SET_MASK(in.ptr, 0, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) - LOAD_AND_SET_MASK(in.ptr, 4, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) - LOAD_AND_SET_MASK(in.ptr, 8, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) - LOAD_AND_SET_MASK(in.ptr, 12, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) - - if(((bright_mask | dark_mask) & 0x1111) == 0) - { - *out.ptr = 0; - return; - } - - LOAD_AND_SET_MASK(in.ptr, 1, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) - LOAD_AND_SET_MASK(in.ptr, 2, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) - LOAD_AND_SET_MASK(in.ptr, 3, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) - LOAD_AND_SET_MASK(in.ptr, 5, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) - LOAD_AND_SET_MASK(in.ptr, 6, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) - LOAD_AND_SET_MASK(in.ptr, 7, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) - LOAD_AND_SET_MASK(in.ptr, 9, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) - LOAD_AND_SET_MASK(in.ptr, 10, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) - LOAD_AND_SET_MASK(in.ptr, 11, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) - LOAD_AND_SET_MASK(in.ptr, 13, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) - LOAD_AND_SET_MASK(in.ptr, 14, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) - LOAD_AND_SET_MASK(in.ptr, 15, input_stride_y, p_dark, p_bright, 
dark_mask, bright_mask) - - bright_mask |= (bright_mask << 16); - dark_mask |= (dark_mask << 16); - - CHECK_CORNER(bright_mask, dark_mask, isCorner) - - if(!isCorner) - { - *out.ptr = 0; - return; - } - -#ifdef USE_MAXSUPPRESSION - *out.ptr = compute_strength(p, in.ptr, input_stride_y, threshold); -#else /* USE_MAXSUPPRESSION */ - *out.ptr = 1; -#endif /* USE_MAXSUPPRESSION */ -} - -/** Copy result to Keypoint buffer and count number of corners - * - * @param[in] input_ptr Pointer to the image with calculated strenghs. Supported data types: U8 - * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image - * @param[in] max_num_points The maximum number of keypoints the array can hold - * @param[out] offset The number of skipped pixels in x dimension - * @param[out] num_of_points Number of points found - * @param[out] out The keypoints found - * - */ -__kernel void copy_to_keypoint( - IMAGE_DECLARATION(input), - uint max_num_points, - uint offset, - __global uint *num_of_points, - __global Keypoint *out) -{ -#ifndef UPDATE_NUMBER - if(*num_of_points >= max_num_points) - { - return; - } -#endif /* UPDATE_NUMBER */ - - Image in = CONVERT_TO_IMAGE_STRUCT(input); - - uchar value = *in.ptr; - - if(value > 0) - { - int id = atomic_inc(num_of_points); - if(id < max_num_points) - { - out[id].strength = value; - out[id].x = get_global_id(0) + offset; - out[id].y = get_global_id(1) + offset; - out[id].tracking_status = 1; - out[id].scale = 0.f; - out[id].orientation = 0.f; - out[id].error = 0.f; - } - } -} diff --git 
a/src/core/CL/cl_kernels/gaussian_pyramid.cl b/src/core/CL/cl_kernels/gaussian_pyramid.cl deleted file mode 100644 index ae2c31a848..0000000000 --- a/src/core/CL/cl_kernels/gaussian_pyramid.cl +++ /dev/null @@ -1,113 +0,0 @@
/*
 * Copyright (c) 2017 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"

/** Computes the Gaussian Filter 1x5 + sub-sampling along the X direction
 *
 * The taps are weighted 1-4-6-4-1 and the result is left unnormalised (U16). Output pixel i is
 * computed from the input bytes 2*i .. 2*i+4 of the 20 bytes loaded by the work-item.
 *
 * @note Each thread computes 8 pixels
 *
 * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
 * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U16
 * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void gaussian1x5_sub_x(
    IMAGE_DECLARATION(src),
    IMAGE_DECLARATION(dst))
{
    Image in_img  = CONVERT_TO_IMAGE_STRUCT(src);
    Image out_img = CONVERT_TO_IMAGE_STRUCT(dst);

    // 20 input bytes are needed: 16 in the first vector, 4 more right after it
    const uchar16 head = vload16(0, in_img.ptr);
    const uchar4  tail = vload4(0, in_img.ptr + 16);

    // De-interleave the five taps of every output pixel and widen them to 16 bit
    const ushort8 tap_l2 = convert_ushort8(head.even);
    const ushort8 tap_l1 = convert_ushort8(head.odd);
    const ushort8 tap_c  = convert_ushort8((uchar8)(head.s2468, head.sACE, tail.s0));
    const ushort8 tap_r1 = convert_ushort8((uchar8)(head.s3579, head.sBDF, tail.s1));
    const ushort8 tap_r2 = convert_ushort8((uchar8)(head.s468A, head.sCE, tail.s02));

    // 1 * (l2 + r2) + 4 * (l1 + r1) + 6 * centre
    const ushort8 filtered = (tap_l2 + tap_r2) + (tap_l1 + tap_r1) * (ushort8)4 + tap_c * (ushort8)6;

    vstore8(filtered, 0, (__global ushort *)out_img.ptr);
}

/** Computes the Gaussian Filter 5x1 + sub-sampling along the Y direction
 *
 * Five consecutive rows (relative rows 0 to 4) are weighted 1-4-6-4-1, the sum is divided by 256
 * and stored as U8 with saturation.
 *
 * @note Each thread computes 8 pixels
 *
 * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U16
 * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
 * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void gaussian5x1_sub_y(
    IMAGE_DECLARATION(src),
    IMAGE_DECLARATION(dst))
{
    Image in_img  = CONVERT_TO_IMAGE_STRUCT(src);
    Image out_img = CONVERT_TO_IMAGE_STRUCT(dst);

    // One 8-element vector from each of the five rows
    const ushort8 row0 = vload8(0, (__global ushort *)offset(&in_img, 0, 0));
    const ushort8 row1 = vload8(0, (__global ushort *)offset(&in_img, 0, 1));
    const ushort8 row2 = vload8(0, (__global ushort *)offset(&in_img, 0, 2));
    const ushort8 row3 = vload8(0, (__global ushort *)offset(&in_img, 0, 3));
    const ushort8 row4 = vload8(0, (__global ushort *)offset(&in_img, 0, 4));

    // 1 * (row0 + row4) + 4 * (row1 + row3) + 6 * row2
    const ushort8 weighted = (row0 + row4) + (row1 + row3) * (ushort8)4 + row2 * (ushort8)6;

    // Divide by 256 before narrowing to 8 bit
    const ushort8 scaled = weighted >> (ushort8)8;

    vstore8(convert_uchar8_sat(scaled), 0, out_img.ptr);
}
diff --git a/src/core/CL/cl_kernels/harris_corners.cl b/src/core/CL/cl_kernels/harris_corners.cl deleted file mode 100644 index 3e3c9fd23c..0000000000 --- a/src/core/CL/cl_kernels/harris_corners.cl +++ /dev/null @@ -1,376 +0,0 @@
/*
 * Copyright (c) 2016, 2017 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"

/** Function running harris score on 3x3 block size
 *
 * @attention: The input data type should be passed using a compile option -DDATA_TYPE. Supported types: short and int.
 *                       e.g. -DDATA_TYPE=short.
 *
 * @param[in]  src_gx_ptr                           Pointer to the first source image.
Supported data types: S16, S32
 * @param[in]  src_gx_stride_x                      Stride of the first source image in X dimension (in bytes)
 * @param[in]  src_gx_step_x                        src_gx_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_gx_stride_y                      Stride of the first source image in Y dimension (in bytes)
 * @param[in]  src_gx_step_y                        src_gx_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_gx_offset_first_element_in_bytes The offset of the first element in the first source image
 * @param[in]  src_gy_ptr                           Pointer to the second source image. Supported data types: S16, S32
 * @param[in]  src_gy_stride_x                      Stride of the second source image in X dimension (in bytes)
 * @param[in]  src_gy_step_x                        src_gy_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_gy_stride_y                      Stride of the second source image in Y dimension (in bytes)
 * @param[in]  src_gy_step_y                        src_gy_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_gy_offset_first_element_in_bytes The offset of the first element in the second source image
 * @param[out] vc_ptr                               Pointer to the destination image. Supported data types: F32
 * @param[in]  vc_stride_x                          Stride of the destination image in X dimension (in bytes)
 * @param[in]  vc_step_x                            vc_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  vc_stride_y                          Stride of the destination image in Y dimension (in bytes)
 * @param[in]  vc_step_y                            vc_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  vc_offset_first_element_in_bytes     The offset of the first element in the destination image
 * @param[in]  sensitivity                          Sensitivity threshold k from the Harris-Stephens equation
 * @param[in]  strength_thresh                      Minimum threshold with which to eliminate Harris Corner scores (scores not above it are stored as 0)
 * @param[in]  pow4_normalization_factor            Normalization factor to apply harris score
 */
__kernel void harris_score_3x3(
    IMAGE_DECLARATION(src_gx),
    IMAGE_DECLARATION(src_gy),
    IMAGE_DECLARATION(vc),
    float sensitivity,
    float strength_thresh,
    float pow4_normalization_factor)
{
    Image gx_img  = CONVERT_TO_IMAGE_STRUCT(src_gx);
    Image gy_img  = CONVERT_TO_IMAGE_STRUCT(src_gy);
    Image out_img = CONVERT_TO_IMAGE_STRUCT(vc);

    /* Running sums of Gx^2, Gy^2 and Gx*Gy over the 3x3 window of each of the 4 output pixels */
    float4 sum_xx = (float4)0.0f;
    float4 sum_yy = (float4)0.0f;
    float4 sum_xy = (float4)0.0f;

    /* Rows -1, 0 and +1, visited top to bottom */
    for(int row = -1; row <= 1; ++row)
    {
        const VEC_DATA_TYPE(DATA_TYPE, 8) line_gx = vload8(0, (__global DATA_TYPE *)offset(&gx_img, -1, row));
        const VEC_DATA_TYPE(DATA_TYPE, 8) line_gy = vload8(0, (__global DATA_TYPE *)offset(&gy_img, -1, row));

        /* Left, middle and right column of the window for the 4 output pixels */
        const float4 gx_l = convert_float4(line_gx.s0123);
        const float4 gx_m = convert_float4(line_gx.s1234);
        const float4 gx_r = convert_float4(line_gx.s2345);

        const float4 gy_l = convert_float4(line_gy.s0123);
        const float4 gy_m = convert_float4(line_gy.s1234);
        const float4 gy_r = convert_float4(line_gy.s2345);

        sum_xx += (gx_l * gx_l) + (gx_m * gx_m) + (gx_r * gx_r);
        sum_yy += (gy_l * gy_l) + (gy_m * gy_m) + (gy_r * gy_r);
        sum_xy += (gx_l * gy_l) + (gx_m * gy_m) + (gx_r * gy_r);
    }

    /* Trace and determinant of the structure tensor */
    const float4 trace = sum_xx + sum_yy;
    const float4 det   = sum_xx * sum_yy - (sum_xy * sum_xy);

    /* Harris response, normalised */
    float4 mc = (det - (sensitivity * (trace * trace))) * pow4_normalization_factor;

    /* Keep only the responses above the strength threshold */
    mc = select(0.0f, mc, mc > (float4)strength_thresh);

    vstore4(mc, 0, (__global float *)out_img.ptr);
}

/** Function for calculating harris score 1x5.
 *
 * @param[in] src_gx Pointer to gx gradient image.
 * @param[in] src_gy Pointer to gy gradient image.
 * @param[in] row    Relative row.
- */ -inline float16 harris_score_1x5(Image *src_gx, Image *src_gy, int row) -{ - float4 gx2 = 0.0f; - float4 gy2 = 0.0f; - float4 gxgy = 0.0f; - - /* Row */ - VEC_DATA_TYPE(DATA_TYPE, 8) - temp_gx = vload8(0, (__global DATA_TYPE *)offset(src_gx, -2, row)); - VEC_DATA_TYPE(DATA_TYPE, 8) - temp_gy = vload8(0, (__global DATA_TYPE *)offset(src_gy, -2, row)); - - float4 gx = convert_float4(temp_gx.s0123); - float4 gy = convert_float4(temp_gy.s0123); - gx2 += (gx * gx); - gy2 += (gy * gy); - gxgy += (gx * gy); - - gx = convert_float4(temp_gx.s1234); - gy = convert_float4(temp_gy.s1234); - gx2 += (gx * gx); - gy2 += (gy * gy); - gxgy += (gx * gy); - - gx = convert_float4(temp_gx.s2345); - gy = convert_float4(temp_gy.s2345); - gx2 += (gx * gx); - gy2 += (gy * gy); - gxgy += (gx * gy); - - gx = convert_float4(temp_gx.s3456); - gy = convert_float4(temp_gy.s3456); - gx2 += (gx * gx); - gy2 += (gy * gy); - gxgy += (gx * gy); - - gx = convert_float4(temp_gx.s4567); - gy = convert_float4(temp_gy.s4567); - gx2 += (gx * gx); - gy2 += (gy * gy); - gxgy += (gx * gy); - - return (float16)(gx2, gy2, gxgy, (float4)0); -} - -/** Function running harris score on 5x5 block size - * - * @attention: The input data type should be passed using a compile option -DDATA_TYPE. Supported types: short and int. - * e.g. -DDATA_TYPE=short. - * - * @param[in] src_gx_ptr Pointer to the first source image. Supported data types: S16, S32 - * @param[in] src_gx_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_gx_step_x src_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_gx_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_gx_step_y src_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_gx_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] src_gy_ptr Pointer to the second source image. 
Supported data types: S16, S32
 * @param[in]  src_gy_stride_x                      Stride of the second source image in X dimension (in bytes)
 * @param[in]  src_gy_step_x                        src_gy_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_gy_stride_y                      Stride of the second source image in Y dimension (in bytes)
 * @param[in]  src_gy_step_y                        src_gy_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_gy_offset_first_element_in_bytes The offset of the first element in the second source image
 * @param[out] vc_ptr                               Pointer to the destination image. Supported data types: F32
 * @param[in]  vc_stride_x                          Stride of the destination image in X dimension (in bytes)
 * @param[in]  vc_step_x                            vc_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  vc_stride_y                          Stride of the destination image in Y dimension (in bytes)
 * @param[in]  vc_step_y                            vc_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  vc_offset_first_element_in_bytes     The offset of the first element in the destination image
 * @param[in]  sensitivity                          Sensitivity threshold k from the Harris-Stephens equation
 * @param[in]  strength_thresh                      Minimum threshold with which to eliminate Harris Corner scores (scores not above it are stored as 0)
 * @param[in]  pow4_normalization_factor            Normalization factor to apply harris score
 */
__kernel void harris_score_5x5(
    IMAGE_DECLARATION(src_gx),
    IMAGE_DECLARATION(src_gy),
    IMAGE_DECLARATION(vc),
    float sensitivity,
    float strength_thresh,
    float pow4_normalization_factor)
{
    Image gx_img  = CONVERT_TO_IMAGE_STRUCT(src_gx);
    Image gy_img  = CONVERT_TO_IMAGE_STRUCT(src_gy);
    Image out_img = CONVERT_TO_IMAGE_STRUCT(vc);

    /* Packed sums: Gx^2 in the first quarter, Gy^2 in the second, Gx*Gy in the third */
    float16 sums = (float16)0.0f;

    /* Rows -2 .. +2 of the 5x5 window */
    for(int row = -2; row <= 2; ++row)
    {
        sums += harris_score_1x5(&gx_img, &gy_img, row);
    }

    const float4 sum_xx = sums.lo.lo;
    const float4 sum_yy = sums.lo.hi;
    const float4 sum_xy = sums.hi.lo;

    /* Trace and determinant of the structure tensor */
    const float4 trace = sum_xx + sum_yy;
    const float4 det   = sum_xx * sum_yy - (sum_xy * sum_xy);

    /* Harris response, normalised */
    float4 mc = (det - (sensitivity * (trace * trace))) * pow4_normalization_factor;

    /* Keep only the responses above the strength threshold */
    mc = select(0.0f, mc, mc > (float4)strength_thresh);

    vstore4(mc, 0, (__global float *)out_img.ptr);
}

/** Function for calculating harris score 1x7.
 *
 * @param[in] src_gx Pointer to gx gradient image.
 * @param[in] src_gy Pointer to gy gradient image.
 * @param[in] row    Relative row.
 *
 * @return A float16 holding the per-pixel sums of Gx^2 (s0123), Gy^2 (s4567) and Gx*Gy (s89AB)
 *         over the seven columns of the row; the last four lanes are zero.
 */
inline float16 harris_score_1x7(Image *src_gx, Image *src_gy, int row)
{
    float4 acc_xx = 0.0f;
    float4 acc_yy = 0.0f;
    float4 acc_xy = 0.0f;

    /* Ten gradients per image are needed: eight starting three columns to the left, plus two more starting at column +5 */
    const VEC_DATA_TYPE(DATA_TYPE, 8) lo_gx = vload8(0, (__global DATA_TYPE *)offset(src_gx, -3, row));
    const VEC_DATA_TYPE(DATA_TYPE, 8) lo_gy = vload8(0, (__global DATA_TYPE *)offset(src_gy, -3, row));
    const VEC_DATA_TYPE(DATA_TYPE, 2) hi_gx = vload2(0, (__global DATA_TYPE *)offset(src_gx, 5, row));
    const VEC_DATA_TYPE(DATA_TYPE, 2) hi_gy = vload2(0, (__global DATA_TYPE *)offset(src_gy, 5, row));

/* Adds the contribution of one column of the 1x7 window for the 4 output pixels */
#define HARRIS_1X7_ACCUMULATE(gx_window, gy_window)  \
    {                                                \
        const float4 wx = convert_float4(gx_window); \
        const float4 wy = convert_float4(gy_window); \
        acc_xx += (wx * wx);                         \
        acc_yy += (wy * wy);                         \
        acc_xy += (wx * wy);                         \
    }

    HARRIS_1X7_ACCUMULATE(lo_gx.s0123, lo_gy.s0123)
    HARRIS_1X7_ACCUMULATE(lo_gx.s1234, lo_gy.s1234)
    HARRIS_1X7_ACCUMULATE(lo_gx.s2345, lo_gy.s2345)
    HARRIS_1X7_ACCUMULATE(lo_gx.s3456, lo_gy.s3456)
    HARRIS_1X7_ACCUMULATE(lo_gx.s4567, lo_gy.s4567)
    /* The last two columns straddle both loads */
    HARRIS_1X7_ACCUMULATE((VEC_DATA_TYPE(DATA_TYPE, 4))(lo_gx.s567, hi_gx.s0), (VEC_DATA_TYPE(DATA_TYPE, 4))(lo_gy.s567, hi_gy.s0))
    HARRIS_1X7_ACCUMULATE((VEC_DATA_TYPE(DATA_TYPE, 4))(lo_gx.s67, hi_gx.s01), (VEC_DATA_TYPE(DATA_TYPE, 4))(lo_gy.s67, hi_gy.s01))

#undef HARRIS_1X7_ACCUMULATE

    return (float16)(acc_xx, acc_yy, acc_xy, (float4)0);
}

/** Function running harris score on 7x7 block size
 *
 * @attention: The input data type should be passed using a compile option -DDATA_TYPE. Supported types: short and int.
 *                       e.g. -DDATA_TYPE=short.
 *
 * @param[in]  src_gx_ptr                           Pointer to the first source image. Supported data types: S16, S32
 * @param[in]  src_gx_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  src_gx_step_x                        src_gx_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_gx_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_gx_step_y                        src_gx_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_gx_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in]  src_gy_ptr                           Pointer to the second source image. Supported data types: S16, S32
 * @param[in]  src_gy_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  src_gy_step_x                        src_gy_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_gy_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  src_gy_step_y                        src_gy_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_gy_offset_first_element_in_bytes The offset of the first element in the destination image
 * @param[out] vc_ptr                               Pointer to the destination image.
Supported data types: F32
 * @param[in]  vc_stride_x                          Stride of the destination image in X dimension (in bytes)
 * @param[in]  vc_step_x                            vc_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  vc_stride_y                          Stride of the destination image in Y dimension (in bytes)
 * @param[in]  vc_step_y                            vc_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  vc_offset_first_element_in_bytes     The offset of the first element in the destination image
 * @param[in]  sensitivity                          Sensitivity threshold k from the Harris-Stephens equation
 * @param[in]  strength_thresh                      Minimum threshold with which to eliminate Harris Corner scores (scores not above it are stored as 0)
 * @param[in]  pow4_normalization_factor            Normalization factor to apply harris score
 */
__kernel void harris_score_7x7(
    IMAGE_DECLARATION(src_gx),
    IMAGE_DECLARATION(src_gy),
    IMAGE_DECLARATION(vc),
    float sensitivity,
    float strength_thresh,
    float pow4_normalization_factor)
{
    Image gx_img  = CONVERT_TO_IMAGE_STRUCT(src_gx);
    Image gy_img  = CONVERT_TO_IMAGE_STRUCT(src_gy);
    Image out_img = CONVERT_TO_IMAGE_STRUCT(vc);

    /* Packed sums: Gx^2 in the first quarter, Gy^2 in the second, Gx*Gy in the third */
    float16 sums = (float16)0.0f;

    /* Rows -3 .. +3 of the 7x7 window */
    for(int row = -3; row <= 3; ++row)
    {
        sums += harris_score_1x7(&gx_img, &gy_img, row);
    }

    const float4 sum_xx = sums.lo.lo;
    const float4 sum_yy = sums.lo.hi;
    const float4 sum_xy = sums.hi.lo;

    /* Trace and determinant of the structure tensor */
    const float4 trace = sum_xx + sum_yy;
    const float4 det   = sum_xx * sum_yy - (sum_xy * sum_xy);

    /* Harris response, normalised */
    float4 mc = (det - (sensitivity * (trace * trace))) * pow4_normalization_factor;

    /* Keep only the responses above the strength threshold */
    mc = select(0.0f, mc, mc > (float4)strength_thresh);

    vstore4(mc, 0, (__global float *)out_img.ptr);
}
diff --git a/src/core/CL/cl_kernels/histogram.cl b/src/core/CL/cl_kernels/histogram.cl deleted file mode 100644 index a93cb4d1c7..0000000000 --- a/src/core/CL/cl_kernels/histogram.cl +++ /dev/null @@ -1,243 +0,0 @@
/*
 * Copyright (c) 2016, 2017 Arm Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#define VATOMIC_INC16(histogram, win_pos) \ - { \ - atomic_inc(histogram + win_pos.s0); \ - atomic_inc(histogram + win_pos.s1); \ - atomic_inc(histogram + win_pos.s2); \ - atomic_inc(histogram + win_pos.s3); \ - atomic_inc(histogram + win_pos.s4); \ - atomic_inc(histogram + win_pos.s5); \ - atomic_inc(histogram + win_pos.s6); \ - atomic_inc(histogram + win_pos.s7); \ - atomic_inc(histogram + win_pos.s8); \ - atomic_inc(histogram + win_pos.s9); \ - atomic_inc(histogram + win_pos.sa); \ - atomic_inc(histogram + win_pos.sb); \ - atomic_inc(histogram + win_pos.sc); \ - atomic_inc(histogram + win_pos.sd); \ - atomic_inc(histogram + win_pos.se); \ - atomic_inc(histogram + win_pos.sf); \ - } - -/** Calculate the histogram of an 8 bit grayscale image. 
- * - * Each thread will process 16 pixels and use one local atomic operation per pixel. - * When all work items in a work group are done the resulting local histograms are - * added to the global histogram using global atomics. - * - * @note The input image is represented as a two-dimensional array of type uchar. - * The output is represented as a one-dimensional uint array of length of num_bins - * - * @param[in] input_ptr Pointer to the first source image. Supported data types: U8 - * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image - * @param[in] histogram_local The local buffer to hold histogram result in per workgroup. Supported data types: U32 - * @param[out] histogram The output buffer to hold histogram final result. 
Supported data types: U32
 * @param[in]  num_bins                            The number of bins
 * @param[in]  offset                              The start of values to use (inclusive)
 * @param[in]  range                               The range of a bin
 * @param[in]  offrange                            The maximum value (exclusive)
 */
__kernel void hist_local_kernel(IMAGE_DECLARATION(input),
                                __local uint *histogram_local,
                                __global uint *restrict histogram,
                                uint num_bins,
                                uint offset,
                                uint range,
                                uint offrange)
{
    Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);

    const uint local_id_x   = get_local_id(0);
    const uint local_x_size = get_local_size(0);

    // Clear the work-group histogram. Index num_bins is included on purpose: it is the slot that
    // collects pixels outside [offset, offrange) (see win_pos below), so histogram_local has to
    // provide num_bins + 1 entries. The strided loop covers every relation between num_bins and the
    // work-group size.
    for(uint i = local_id_x; i <= num_bins; i += local_x_size)
    {
        histogram_local[i] = 0;
    }

    // Each work-item handles 16 consecutive pixels
    uint16 vals = convert_uint16(vload16(0, input_buffer.ptr));

    // Map each pixel to its bin; pixels outside the window are redirected to the spare slot num_bins
    uint16 win_pos = select((uint16)num_bins, ((vals - offset) * num_bins) / range, (vals >= offset && vals < offrange));

    // Make sure the local histogram is fully cleared before anyone increments it
    barrier(CLK_LOCAL_MEM_FENCE);
    VATOMIC_INC16(histogram_local, win_pos);
    // Make sure every work-item has voted before the local histogram is flushed
    barrier(CLK_LOCAL_MEM_FENCE);

    // Merge the real bins into the global histogram. The spare slot is deliberately not flushed:
    // the output only holds num_bins entries, so writing histogram[num_bins] would be out of bounds.
    for(uint i = local_id_x; i < num_bins; i += local_x_size)
    {
        atomic_add(histogram + i, histogram_local[i]);
    }
}

/** Calculate the histogram of an 8 bit grayscale image's border.
 *
 * Each thread will process one pixel using global atomic.
 * The matching bin is updated directly in the global histogram; no local histogram is used.
 *
 * @note The input image is represented as a two-dimensional array of type uchar.
 * The output is represented as a one-dimensional uint array of length of num_bins
 *
 * @param[in]  input_ptr                           Pointer to the first source image.
Supported data types: U8
 * @param[in]  input_stride_x                      Stride of the first source image in X dimension (in bytes)
 * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                      Stride of the first source image in Y dimension (in bytes)
 * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the first source image
 * @param[out] histogram                           The output buffer to hold histogram final result. Supported data types: U32
 * @param[in]  num_bins                            The number of bins
 * @param[in]  offset                              The start of values to use (inclusive)
 * @param[in]  range                               The range of a bin
 * @param[in]  offrange                            The maximum value (exclusive)
 */
__kernel void hist_border_kernel(IMAGE_DECLARATION(input),
                                 __global uint *restrict histogram,
                                 uint num_bins,
                                 uint offset,
                                 uint range,
                                 uint offrange)
{
    Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);

    // One pixel per work-item
    const uint pixel = (uint)(*input_buffer.ptr);

    // Pixels outside [offset, offrange) do not contribute to any bin
    if(pixel < offset || pixel >= offrange)
    {
        return;
    }

    // pixel >= offset here, so the subtraction cannot wrap around
    const uint bin = ((pixel - offset) * num_bins) / range;

    atomic_inc(histogram + bin);
}

/** Calculate the histogram of an 8 bit grayscale image with bin size of 256 and window size of 1.
 *
 * Each thread will process 16 pixels and use one local atomic operation per pixel.
 * When all work items in a work group are done the resulting local histograms are
 * added to the global histogram using global atomics.
 *
 * @note The input image is represented as a two-dimensional array of type uchar.
 * The output is represented as a one-dimensional uint array of 256 elements
 *
 * @param[in]  input_ptr                           Pointer to the first source image.
Supported data types: U8
 * @param[in]  input_stride_x                      Stride of the first source image in X dimension (in bytes)
 * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                      Stride of the first source image in Y dimension (in bytes)
 * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the first source image
 * @param[in]  histogram_local                     The local buffer to hold histogram result in per workgroup. Supported data types: U32
 * @param[out] histogram                           The output buffer to hold histogram final result. Supported data types: U32
 */
__kernel void hist_local_kernel_fixed(IMAGE_DECLARATION(input),
                                      __local uint *histogram_local,
                                      __global uint *restrict histogram)
{
    Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);

    const uint lid   = get_local_id(0);
    const uint lsize = get_local_size(0);

    // Cooperatively clear the 256 local bins
    for(int bin = lid; bin < 256; bin += lsize)
    {
        histogram_local[bin] = 0;
    }

    // Each work-item handles 16 consecutive pixels; the pixel value is used directly as bin index
    uint16 pixels = convert_uint16(vload16(0, input_buffer.ptr));

    // The local histogram must be fully cleared before it is incremented
    barrier(CLK_LOCAL_MEM_FENCE);

    VATOMIC_INC16(histogram_local, pixels);

    // Every work-item must have voted before the local histogram is flushed
    barrier(CLK_LOCAL_MEM_FENCE);

    // Cooperatively merge the local bins into the global histogram
    for(int bin = lid; bin < 256; bin += lsize)
    {
        atomic_add(histogram + bin, histogram_local[bin]);
    }
}

/** Calculate the histogram of the border of an 8 bit grayscale image with 256 bins and window size 1.
 *
 * Each thread processes one pixel and increments the matching bin of the global histogram
 * with a global atomic; the pixel value is used directly as the bin index.
 *
 * @note The input image is represented as a two-dimensional array of type uchar.
 * The output is represented as a one-dimensional uint array of 256 elements
 *
 * @param[in]  input_ptr                           Pointer to the first source image. Supported data types: U8
 * @param[in]  input_stride_x                      Stride of the first source image in X dimension (in bytes)
 * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                      Stride of the first source image in Y dimension (in bytes)
 * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the first source image
 * @param[out] histogram                           The output buffer to hold histogram final result. Supported data types: U32
 */
__kernel void hist_border_kernel_fixed(IMAGE_DECLARATION(input),
                                       __global uint *restrict histogram)
{
    Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);

    const uchar pixel = *input_buffer.ptr;

    atomic_inc(histogram + pixel);
}
diff --git a/src/core/CL/cl_kernels/hog.cl b/src/core/CL/cl_kernels/hog.cl
deleted file mode 100644
index b14f361df6..0000000000
--- a/src/core/CL/cl_kernels/hog.cl
+++ /dev/null
@@ -1,456 +0,0 @@
/*
 * Copyright (c) 2017-2018 Arm Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" -#include "types.h" - -#if defined(CELL_WIDTH) && defined(CELL_HEIGHT) && defined(NUM_BINS) && defined(PHASE_SCALE) - -/** This OpenCL kernel computes the HOG orientation binning - * - * @attention The following variables must be passed at compile time: - * - * -# -DCELL_WIDTH = Width of the cell - * -# -DCELL_HEIGHT = height of the cell - * -# -DNUM_BINS = Number of bins for each cell - * -# -DPHASE_SCALE = Scale factor used to evaluate the index of the local HOG - * - * @note Each work-item computes a single cell - * - * @param[in] mag_ptr Pointer to the source image which stores the magnitude of the gradient for each pixel. 
Supported data types: S16 - * @param[in] mag_stride_x Stride of the magnitude image in X dimension (in bytes) - * @param[in] mag_step_x mag_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] mag_stride_y Stride of the magnitude image in Y dimension (in bytes) - * @param[in] mag_step_y mag_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] mag_offset_first_element_in_bytes The offset of the first element in the magnitude image - * @param[in] phase_ptr Pointer to the source image which stores the phase of the gradient for each pixel. Supported data types: U8 - * @param[in] phase_stride_x Stride of the phase image in X dimension (in bytes) - * @param[in] phase_step_x phase_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] phase_stride_y Stride of the phase image in Y dimension (in bytes) - * @param[in] phase_step_y phase_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] phase_offset_first_element_in_bytes The offset of the first element in the phase image - * @param[out] dst_ptr Pointer to the destination image which stores the local HOG for each cell. Supported data types: F32. 
Number of channels supported: equal to the number of histogram bins per cell
 * @param[in]  dst_stride_x                        Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_step_x                          dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                        Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_step_y                          dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes   The offset of the first element in the destination image
 */
__kernel void hog_orientation_binning(IMAGE_DECLARATION(mag),
                                      IMAGE_DECLARATION(phase),
                                      IMAGE_DECLARATION(dst))
{
    // Histogram of the cell handled by this work-item
    float cell_hist[NUM_BINS] = { 0 };

    // Top-left pixel of the cell in the magnitude and phase images
    Image mag_img   = CONVERT_TO_IMAGE_STRUCT(mag);
    Image phase_img = CONVERT_TO_IMAGE_STRUCT(phase);

    for(int row = 0; row < CELL_HEIGHT; ++row)
    {
        // Start of the current cell row (the Y strides are expressed in bytes)
        __global const short *mag_row   = (__global const short *)(mag_img.ptr + row * mag_stride_y);
        __global const uchar *phase_row = phase_img.ptr + row * phase_stride_y;

        int col = 0;

        // Four pixels at a time
        for(; col <= (CELL_WIDTH - 4); col += 4)
        {
            const float4 magnitude = convert_float4(vload4(0, mag_row + col));

            // Scaled phase: phase * scale + 0.5f
            const float4 scaled_phase = (float4)0.5f + convert_float4(vload4(0, phase_row + col)) * (float4)PHASE_SCALE;

            // The truncated scaled phase is the index of the first bin; its fractional part is the
            // share of the magnitude that goes to the following bin
            const int4   first_raw = convert_int4(scaled_phase);
            const float4 w_second  = scaled_phase - convert_float4(first_raw);
            const float4 w_first   = (float4)1.0f - w_second;

            const float4 vote_first  = magnitude * w_first;
            const float4 vote_second = magnitude * w_second;

            // An index equal to NUM_BINS wraps around to bin 0
            const int4 first_bin  = select(first_raw, (int4)0, first_raw == (int4)(NUM_BINS));
            const int4 next_raw   = first_bin + (int4)1;
            const int4 second_bin = select(next_raw, (int4)0, next_raw == (int4)(NUM_BINS));

            // Votes for the first bin
            cell_hist[first_bin.s0] += vote_first.s0;
            cell_hist[first_bin.s1] += vote_first.s1;
            cell_hist[first_bin.s2] += vote_first.s2;
            cell_hist[first_bin.s3] += vote_first.s3;

            // Votes for the second bin
            cell_hist[second_bin.s0] += vote_second.s0;
            cell_hist[second_bin.s1] += vote_second.s1;
            cell_hist[second_bin.s2] += vote_second.s2;
            cell_hist[second_bin.s3] += vote_second.s3;
        }

        // Remaining pixels of the row, one at a time
        for(; col < CELL_WIDTH; col++)
        {
            const float magnitude    = mag_row[col];
            const float scaled_phase = phase_row[col] * (float)PHASE_SCALE + 0.5f;
            const float w_second     = scaled_phase - floor(scaled_phase);

            // The modulo wraps an index equal to NUM_BINS back to bin 0
            const uint first_bin  = (uint)(scaled_phase) % NUM_BINS;
            const uint second_bin = (first_bin + 1) % NUM_BINS;

            cell_hist[first_bin] += magnitude * (1.0f - w_second);
            cell_hist[second_bin] += magnitude * w_second;
        }
    }

    // Destination of the local HOG of this cell
    Image dst_img = CONVERT_TO_IMAGE_STRUCT(dst);

    __global float *out = (__global float *)dst_img.ptr;

    // Copy the histogram to global memory, four bins at a time
    int bin = 0;
    for(; bin <= (NUM_BINS - 4); bin += 4)
    {
        vstore4(vload4(0, cell_hist + bin), 0, out + bin);
    }

    // Remaining bins
    for(; bin < NUM_BINS; ++bin)
    {
        out[bin] = cell_hist[bin];
    }
}
#endif /* CELL_WIDTH and CELL_HEIGHT and NUM_BINS and PHASE_SCALE */

#if defined(NUM_CELLS_PER_BLOCK_HEIGHT) && defined(NUM_BINS_PER_BLOCK_X) && defined(NUM_BINS_PER_BLOCK) && defined(HOG_NORM_TYPE) && defined(L2_HYST_THRESHOLD)

#ifndef L2_NORM
#error
The value of enum class HOGNormType::L2_NORM has not be passed to the OpenCL kernel -#endif /* not L2_NORM */ - -#ifndef L2HYS_NORM -#error The value of enum class HOGNormType::L2HYS_NORM has not be passed to the OpenCL kernel -#endif /* not L2HYS_NORM */ - -#ifndef L1_NORM -#error The value of enum class HOGNormType::L1_NORM has not be passed to the OpenCL kernel -#endif /* not L1_NORM */ - -/** This OpenCL kernel computes the HOG block normalization - * - * @attention The following variables must be passed at compile time: - * - * -# -DNUM_CELLS_PER_BLOCK_HEIGHT = Number of cells for each block - * -# -DNUM_BINS_PER_BLOCK_X = Number of bins for each block along the X direction - * -# -DNUM_BINS_PER_BLOCK = Number of bins for each block - * -# -DHOG_NORM_TYPE = Normalization type - * -# -DL2_HYST_THRESHOLD = Threshold used for L2HYS_NORM normalization method - * -# -DL2_NORM = Value of the enum class HOGNormType::L2_NORM - * -# -DL2HYS_NORM = Value of the enum class HOGNormType::L2HYS_NORM - * -# -DL1_NORM = Value of the enum class HOGNormType::L1_NORM - * - * @note Each work-item computes a single block - * - * @param[in] src_ptr Pointer to the source image which stores the local HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image which stores the normlized HOG Supported data types: F32. 
Number of channels supported: equal to the number of histogram bins per block
 * @param[in]  dst_stride_x                        Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_step_x                          dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                        Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_step_y                          dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes   The offset of the first element in the destination image
 */
__kernel void hog_block_normalization(IMAGE_DECLARATION(src),
                                      IMAGE_DECLARATION(dst))
{
    // Two accumulators: sum_f32 collects the vectorised part, sum the scalar left-overs.
    // They are merged with a dot product once the block has been traversed.
    float  sum     = 0.0f;
    float4 sum_f32 = (float4)(0.0f);

    // Compute address for the source and destination tensor
    Image src = CONVERT_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Pass 1: copy the un-normalized bins of the block to dst and accumulate the norm
    for(size_t yc = 0; yc < NUM_CELLS_PER_BLOCK_HEIGHT; ++yc)
    {
        // Row yc of the block in the source image (src_stride_y is in bytes)
        const __global float *hist_ptr = (__global float *)(src.ptr + yc * src_stride_y);

        int xc = 0;
        // 16 bins per iteration
        for(; xc <= (NUM_BINS_PER_BLOCK_X - 16); xc += 16)
        {
            const float4 val0 = vload4(0, hist_ptr + xc + 0);
            const float4 val1 = vload4(0, hist_ptr + xc + 4);
            const float4 val2 = vload4(0, hist_ptr + xc + 8);
            const float4 val3 = vload4(0, hist_ptr + xc + 12);

#if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
            // Compute val^2 for L2_NORM or L2HYS_NORM
            sum_f32 += val0 * val0;
            sum_f32 += val1 * val1;
            sum_f32 += val2 * val2;
            sum_f32 += val3 * val3;
#else  /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
            // Compute |val| for L1_NORM
            sum_f32 += fabs(val0);
            sum_f32 += fabs(val1);
            sum_f32 += fabs(val2);
            sum_f32 += fabs(val3);
#endif /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */

            // Store linearly the input values un-normalized in the output image. These values will be reused for the normalization.
            // This approach will help us to be cache friendly in the next for loop where the normalization will be done because all the values
            // will be accessed consecutively
            vstore4(val0, 0, ((__global float *)dst.ptr) + xc + 0 + yc * NUM_BINS_PER_BLOCK_X);
            vstore4(val1, 0, ((__global float *)dst.ptr) + xc + 4 + yc * NUM_BINS_PER_BLOCK_X);
            vstore4(val2, 0, ((__global float *)dst.ptr) + xc + 8 + yc * NUM_BINS_PER_BLOCK_X);
            vstore4(val3, 0, ((__global float *)dst.ptr) + xc + 12 + yc * NUM_BINS_PER_BLOCK_X);
        }

        // Compute left over
        for(; xc < NUM_BINS_PER_BLOCK_X; ++xc)
        {
            const float val = hist_ptr[xc];

#if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
            sum += val * val;
#else  /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
            sum += fabs(val);
#endif /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */

            ((__global float *)dst.ptr)[xc + 0 + yc * NUM_BINS_PER_BLOCK_X] = val;
        }
    }

    // Merge the four vector lanes into the scalar accumulator
    sum += dot(sum_f32, (float4)1.0f);

    // Normalization factor; NUM_BINS_PER_BLOCK * 0.1f keeps the denominator away from zero.
    // NOTE(review): the square root is applied for every HOG_NORM_TYPE, including L1_NORM where
    // sum holds the sum of absolute values - confirm this is the intended behaviour.
    float scale = 1.0f / (sqrt(sum) + NUM_BINS_PER_BLOCK * 0.1f);

#if(HOG_NORM_TYPE == L2HYS_NORM)
    // L2-Hys only: scale, clip to L2_HYST_THRESHOLD and accumulate the norm of the clipped values,
    // from which the final scale is derived

    // Reset sum
    sum_f32 = (float4)0.0f;
    sum     = 0.0f;

    int k = 0;
    // 16 bins per iteration; dst now holds the whole block linearly
    for(; k <= NUM_BINS_PER_BLOCK - 16; k += 16)
    {
        float4 val0 = vload4(0, ((__global float *)dst.ptr) + k + 0);
        float4 val1 = vload4(0, ((__global float *)dst.ptr) + k + 4);
        float4 val2 = vload4(0, ((__global float *)dst.ptr) + k + 8);
        float4 val3 = vload4(0, ((__global float *)dst.ptr) + k + 12);

        // Scale val
        val0 = val0 * (float4)scale;
        val1 = val1 * (float4)scale;
        val2 = val2 * (float4)scale;
        val3 = val3 * (float4)scale;

        // Clip val if over L2_HYST_THRESHOLD
        val0 = fmin(val0, (float4)L2_HYST_THRESHOLD);
        val1 = fmin(val1, (float4)L2_HYST_THRESHOLD);
        val2 = fmin(val2, (float4)L2_HYST_THRESHOLD);
        val3 = fmin(val3, (float4)L2_HYST_THRESHOLD);

        // Compute val^2
        sum_f32 += val0 * val0;
        sum_f32 += val1 * val1;
        sum_f32 += val2 * val2;
        sum_f32 += val3 * val3;

        // Write the scaled and clipped values back in place
        vstore4(val0, 0, ((__global float *)dst.ptr) + k + 0);
        vstore4(val1, 0, ((__global float *)dst.ptr) + k + 4);
        vstore4(val2, 0, ((__global float *)dst.ptr) + k + 8);
        vstore4(val3, 0, ((__global float *)dst.ptr) + k + 12);
    }

    // Compute left over
    for(; k < NUM_BINS_PER_BLOCK; ++k)
    {
        float val = ((__global float *)dst.ptr)[k] * scale;

        // Clip scaled input_value if over L2_HYST_THRESHOLD
        val = fmin(val, (float)L2_HYST_THRESHOLD);

        sum += val * val;

        ((__global float *)dst.ptr)[k] = val;
    }

    sum += dot(sum_f32, (float4)1.0f);

    // We use the same constants of OpenCV
    scale = 1.0f / (sqrt(sum) + 1e-3f);

#endif /* (HOG_NORM_TYPE == L2HYS_NORM) */

    // Final pass: multiply every bin of the block by the normalization factor, in place
    int i = 0;
    for(; i <= (NUM_BINS_PER_BLOCK - 16); i += 16)
    {
        float4 val0 = vload4(0, ((__global float *)dst.ptr) + i + 0);
        float4 val1 = vload4(0, ((__global float *)dst.ptr) + i + 4);
        float4 val2 = vload4(0, ((__global float *)dst.ptr) + i + 8);
        float4 val3 = vload4(0, ((__global float *)dst.ptr) + i + 12);

        // Multiply val by the normalization scale factor
        val0 = val0 * (float4)scale;
        val1 = val1 * (float4)scale;
        val2 = val2 * (float4)scale;
        val3 = val3 * (float4)scale;

        vstore4(val0, 0, ((__global float *)dst.ptr) + i + 0);
        vstore4(val1, 0, ((__global float *)dst.ptr) + i + 4);
        vstore4(val2, 0, ((__global float *)dst.ptr) + i + 8);
        vstore4(val3, 0, ((__global float *)dst.ptr) + i + 12);
    }

    // Left over bins
    for(; i < NUM_BINS_PER_BLOCK; ++i)
    {
        ((__global float *)dst.ptr)[i] *= scale;
    }
}
#endif /* NUM_CELLS_PER_BLOCK_HEIGHT and NUM_BINS_PER_BLOCK_X and NUM_BINS_PER_BLOCK and HOG_NORM_TYPE and L2_HYST_THRESHOLD */

#if defined(NUM_BLOCKS_PER_DESCRIPTOR_Y) && defined(NUM_BINS_PER_DESCRIPTOR_X) && defined(THRESHOLD) && defined(MAX_NUM_DETECTION_WINDOWS) && defined(IDX_CLASS) && defined(DETECTION_WINDOW_STRIDE_WIDTH) && defined(DETECTION_WINDOW_STRIDE_HEIGHT) && defined(DETECTION_WINDOW_WIDTH) && defined(DETECTION_WINDOW_HEIGHT)

/** This OpenCL kernel computes the HOG
detector using linear SVM - * - * @attention The following variables must be passed at compile time: - * - * -# -DNUM_BLOCKS_PER_DESCRIPTOR_Y = Number of blocks per descriptor along the Y direction - * -# -DNUM_BINS_PER_DESCRIPTOR_X = Number of bins per descriptor along the X direction - * -# -DTHRESHOLD = Threshold for the distance between features and SVM classifying plane - * -# -DMAX_NUM_DETECTION_WINDOWS = Maximum number of possible detection windows. It is equal to the size of the DetectionWindow array - * -# -DIDX_CLASS = Index of the class to detect - * -# -DDETECTION_WINDOW_STRIDE_WIDTH = Detection window stride for the X direction - * -# -DDETECTION_WINDOW_STRIDE_HEIGHT = Detection window stride for the Y direction - * -# -DDETECTION_WINDOW_WIDTH = Width of the detection window - * -# -DDETECTION_WINDOW_HEIGHT = Height of the detection window - * - * @note Each work-item computes a single detection window - * - * @param[in] src_ptr Pointer to the source image which stores the local HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] hog_descriptor Pointer to HOG descriptor. 
Supported data types: F32
 * @param[out]    dst                               Pointer to DetectionWindow array
 * @param[in,out] num_detection_windows             Number of objects detected
 */
__kernel void hog_detector(IMAGE_DECLARATION(src),
                           __global float *hog_descriptor,
                           __global DetectionWindow *dst,
                           __global uint *num_detection_windows)
{
    // Nothing to do once the DetectionWindow array is full
    if(*num_detection_windows >= MAX_NUM_DETECTION_WINDOWS)
    {
        return;
    }

    Image window = CONVERT_TO_IMAGE_STRUCT(src);

    // Distance between two consecutive rows of the source image, in float elements
    const int row_pitch = src_stride_y / sizeof(float);

    // Vector and scalar partial sums of the linear SVM dot product
    float4 acc_f32 = (float4)0.0f;
    float  acc     = 0.0f;

    __global float *features = (__global float *)window.ptr;

    // Linear SVM: dot product between the window features and the descriptor weights
    for(int yb = 0; yb < NUM_BLOCKS_PER_DESCRIPTOR_Y; ++yb)
    {
        // Row yb of the descriptor weights
        __global float *weights = hog_descriptor + yb * NUM_BINS_PER_DESCRIPTOR_X;

        int xb = 0;

        // Eight values per iteration
        for(; xb < (int)NUM_BINS_PER_DESCRIPTOR_X - 8; xb += 8)
        {
            const float4 f_lo = vload4(0, features + xb + 0);
            const float4 f_hi = vload4(0, features + xb + 4);

            const float4 w_lo = vload4(0, weights + xb + 0);
            const float4 w_hi = vload4(0, weights + xb + 4);

            // Multiply accumulate
            acc_f32 += f_lo * w_lo;
            acc_f32 += f_hi * w_hi;
        }

        // Remaining values of the row
        for(; xb < NUM_BINS_PER_DESCRIPTOR_X; ++xb)
        {
            const float f = features[xb];
            const float w = weights[xb];

            acc += f * w;
        }

        // Next row of the source image
        features += row_pitch;
    }

    float score = acc + dot(acc_f32, (float4)1.0f);

    // Add the bias, stored right after the last weight, i.e. at index
    // NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y
    score += hog_descriptor[NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y];

    if(score > (float)THRESHOLD)
    {
        // Reserve a slot in the output array; slots past the capacity are dropped
        int slot = atomic_inc(num_detection_windows);
        if(slot < MAX_NUM_DETECTION_WINDOWS)
        {
            dst[slot].x         = get_global_id(0) * DETECTION_WINDOW_STRIDE_WIDTH;
            dst[slot].y         = get_global_id(1) * DETECTION_WINDOW_STRIDE_HEIGHT;
            dst[slot].width     = DETECTION_WINDOW_WIDTH;
            dst[slot].height    = DETECTION_WINDOW_HEIGHT;
            dst[slot].idx_class = IDX_CLASS;
            dst[slot].score     = score;
        }
    }
}
#endif /* NUM_BLOCKS_PER_DESCRIPTOR_Y && NUM_BINS_PER_DESCRIPTOR_X && THRESHOLD && MAX_NUM_DETECTION_WINDOWS && IDX_CLASS &&
        * DETECTION_WINDOW_STRIDE_WIDTH && DETECTION_WINDOW_STRIDE_HEIGHT && DETECTION_WINDOW_WIDTH && DETECTION_WINDOW_HEIGHT */
diff --git a/src/core/CL/cl_kernels/integral_image.cl b/src/core/CL/cl_kernels/integral_image.cl
deleted file mode 100644
index dd2c7982f4..0000000000
--- a/src/core/CL/cl_kernels/integral_image.cl
+++ /dev/null
@@ -1,100 +0,0 @@
/*
 * Copyright (c) 2016, 2017 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** This function computes the horizontal integral of the image. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. 
Supported data types: U32
 * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void integral_horizontal(
    IMAGE_DECLARATION(src),
    IMAGE_DECLARATION(dst))
{
    Image in_img  = CONVERT_TO_IMAGE_STRUCT(src);
    Image out_img = CONVERT_TO_IMAGE_STRUCT(dst);

    // Running total carried from one group of 16 pixels to the next
    uint carry = 0;

    for(uint x = 0; x < src_step_x; x += 16)
    {
        barrier(CLK_GLOBAL_MEM_FENCE);

        // Widen 16 source pixels to 32 bit
        uint16 acc = convert_uint16(vload16(0, offset(&in_img, x, 0)));

        // In-register inclusive prefix sum: every lane adds the total of the lane before it
        acc.s0 += carry;
        acc.s1 += acc.s0;
        acc.s2 += acc.s1;
        acc.s3 += acc.s2;
        acc.s4 += acc.s3;
        acc.s5 += acc.s4;
        acc.s6 += acc.s5;
        acc.s7 += acc.s6;
        acc.s8 += acc.s7;
        acc.s9 += acc.s8;
        acc.sA += acc.s9;
        acc.sB += acc.sA;
        acc.sC += acc.sB;
        acc.sD += acc.sC;
        acc.sE += acc.sD;
        acc.sF += acc.sE;

        // The last lane is the starting value of the next group
        carry = acc.sF;

        vstore16(acc, 0, (__global uint *)offset(&out_img, x, 0));
    }
}

/** This function computes the vertical integral of the image.
 *
 * @param[in,out] src_ptr                           Pointer to the source image. Supported data types: U32
 * @param[in]     src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]     src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]     src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]     src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]     src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in]     height                            Image height.
- */ -__kernel void integral_vertical( - IMAGE_DECLARATION(src), - uint height) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - - uint8 prev = vload8(0, (__global uint *)offset(&src, 0, 0)); - for(uint j = 1; j < height; ++j) - { - barrier(CLK_GLOBAL_MEM_FENCE); - uint8 res = vload8(0, (__global uint *)offset(&src, 0, j)); - res += prev; - vstore8(res, 0, (__global uint *)offset(&src, 0, j)); - prev = res; - } -} diff --git a/src/core/CL/cl_kernels/magnitude_phase.cl b/src/core/CL/cl_kernels/magnitude_phase.cl deleted file mode 100644 index 48197d6473..0000000000 --- a/src/core/CL/cl_kernels/magnitude_phase.cl +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Copyright (c) 2016, 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** Calculates L1 normalization between two inputs. - * - * @param[in] a First input. 
Supported data types: S16, S32 - * @param[in] b Second input. Supported data types: S16, S32 - * - * @return L1 normalization magnitude result. Supported data types: S16, S32 - */ -inline VEC_DATA_TYPE(DATA_TYPE, 16) magnitude_l1(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b) -{ - return CONVERT_SAT(add_sat(abs(a), abs(b)), VEC_DATA_TYPE(DATA_TYPE, 16)); -} - -/** Calculates the L2 norm of two inputs: sqrt(a * a + b * b) + 0.5, converted with saturation to DATA_TYPE. - * - * @param[in] a First input, as a vector of 16 ints (the MAGNITUDE_OP macro converts the S16 or S32 gradients with convert_int16) - * @param[in] b Second input, as a vector of 16 ints (converted in the same way) - * - * @return L2 norm magnitude result. Supported data types: S16, S32 - */ -inline VEC_DATA_TYPE(DATA_TYPE, 16) magnitude_l2(int16 a, int16 b) -{ - return CONVERT_SAT((sqrt(convert_float16((convert_uint16(a * a) + convert_uint16(b * b)))) + 0.5f), - VEC_DATA_TYPE(DATA_TYPE, 16)); -} - -/** Calculates unsigned phase between two inputs. - * - * @param[in] a First input (x argument of atan2pi). Supported data types: S16, S32 - * @param[in] b Second input (y argument of atan2pi). Supported data types: S16, S32 - * - * @return Unsigned phase in degrees mapped in the interval [0, 180] (180 is added to negative angles). Supported data types: U8 - */ -inline uchar16 phase_unsigned(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b) -{ - float16 angle_deg_f32 = atan2pi(convert_float16(b), convert_float16(a)) * (float16)180.0f; - angle_deg_f32 = select(angle_deg_f32, (float16)180.0f + angle_deg_f32, angle_deg_f32 < (float16)0.0f); - return convert_uchar16(angle_deg_f32); -} - -/** Calculates signed phase between two inputs. - * - * @param[in] a First input (x argument of atan2pi). Supported data types: S16, S32 - * @param[in] b Second input (y argument of atan2pi). Supported data types: S16, S32 - * - * @return Signed phase mapped in the interval [0, 256).
Supported data types: U8 - */ -inline uchar16 phase_signed(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b) -{ - float16 arct = atan2pi(convert_float16(b), convert_float16(a)); - arct = select(arct, arct + 2, arct < 0.0f); - - return convert_uchar16(convert_int16(mad(arct, 128, 0.5f)) & (int16)0xFFu); -} - -#if(1 == MAGNITUDE) -#define MAGNITUDE_OP(x, y) magnitude_l1((x), (y)) -#elif(2 == MAGNITUDE) -#define MAGNITUDE_OP(x, y) magnitude_l2(convert_int16(x), convert_int16(y)) -#else /* MAGNITUDE */ -#define MAGNITUDE_OP(x, y) -#endif /* MAGNITUDE */ - -#if(1 == PHASE) -#define PHASE_OP(x, y) phase_unsigned((x), (y)) -#elif(2 == PHASE) -#define PHASE_OP(x, y) phase_signed((x), (y)) -#else /* PHASE */ -#define PHASE_OP(x, y) -#endif /* PHASE */ - -/** Calculate the magnitude and phase given the gradients of an image. - * - * @note Magnitude calculation supported: L1 normalization(type = 1) and L2 normalization(type = 2). - * @note Phase calculation supported: Unsigned(type = 1) [0,180] and Signed(type = 2) [0,256). - * @note Each work item loads 16 elements from each gradient image and stores 16 results per enabled output. - * - * @attention To enable phase calculation -DPHASE="phase_calculation_type_id" must be provided at compile time. e.g. -DPHASE=1 - * @attention To enable magnitude calculation -DMAGNITUDE="magnitude_calculation_type_id" must be provided at compile time. e.g. -DMAGNITUDE=1 - * @attention Datatype of the two inputs is passed at compile time using -DDATA_TYPE. e.g. -DDATA_TYPE=short. Supported data_types are: short and int - * - * @param[in] gx_ptr Pointer to the first source image (gradient X).
Supported data types: S16, S32 - * @param[in] gx_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] gx_step_x gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] gx_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] gx_step_y gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] gx_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] gy_ptr Pointer to the second source image (gradient Y). Supported data types: S16, S32 - * @param[in] gy_stride_x Stride of the second source image in X dimension (in bytes) - * @param[in] gy_step_x gy_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] gy_stride_y Stride of the second source image in Y dimension (in bytes) - * @param[in] gy_step_y gy_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] gy_offset_first_element_in_bytes The offset of the first element in the second source image - * @param[out] magnitude_ptr Pointer to the magnitude destination image. Supported data types: S16, S32 - * @param[in] magnitude_stride_x Stride of the magnitude destination image in X dimension (in bytes) - * @param[in] magnitude_step_x magnitude_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] magnitude_stride_y Stride of the magnitude destination image in Y dimension (in bytes) - * @param[in] magnitude_step_y magnitude_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] magnitude_offset_first_element_in_bytes The offset of the first element in the magnitude destination image - * @param[out] phase_ptr Pointer to the phase destination image.
Supported data types: U8 - * @param[in] phase_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] phase_step_x phase_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] phase_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] phase_step_y phase_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] phase_offset_first_element_in_bytes The offset of the first element in the destination image - * */ -__kernel void magnitude_phase( - IMAGE_DECLARATION(gx), - IMAGE_DECLARATION(gy) -#ifdef MAGNITUDE - , - IMAGE_DECLARATION(magnitude) -#endif /* MAGNITUDE */ -#ifdef PHASE - , - IMAGE_DECLARATION(phase) -#endif /* PHASE */ -) -{ - // Get pixels pointer - Image gx = CONVERT_TO_IMAGE_STRUCT(gx); - Image gy = CONVERT_TO_IMAGE_STRUCT(gy); - - // Load values - VEC_DATA_TYPE(DATA_TYPE, 16) - in_a = vload16(0, (__global DATA_TYPE *)gx.ptr); - VEC_DATA_TYPE(DATA_TYPE, 16) - in_b = vload16(0, (__global DATA_TYPE *)gy.ptr); - - // Calculate and store the results -#ifdef MAGNITUDE - Image magnitude = CONVERT_TO_IMAGE_STRUCT(magnitude); - vstore16(MAGNITUDE_OP(in_a, in_b), 0, (__global DATA_TYPE *)magnitude.ptr); -#endif /* MAGNITUDE */ -#ifdef PHASE - Image phase = CONVERT_TO_IMAGE_STRUCT(phase); - vstore16(PHASE_OP(in_a, in_b), 0, phase.ptr); -#endif /* PHASE */ -} diff --git a/src/core/CL/cl_kernels/mean_stddev.cl b/src/core/CL/cl_kernels/mean_stddev.cl deleted file mode 100644 index 4ddf931e4b..0000000000 --- a/src/core/CL/cl_kernels/mean_stddev.cl +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2016-2018 Arm Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable - -/** This function calculates the sum and sum of squares of a given input image. - * - * @note To enable the calculation of the sum of squares, -DSTDDEV should be passed as a preprocessor argument. - * @note Each work item accumulates 8 consecutive elements of every row over all the rows, then adds its partial result to the global accumulators with atom_add. - * @note NOTE(review): the partial sums are kept in 32-bit lanes (uint8) until the final atom_add; with U8 input set to 255 everywhere the sum of squares exceeds 32 bits once height is above 8256 rows, so confirm the maximum supported height. - * - * @param[in] src_ptr Pointer to the source image.
Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] height Height of the input image - * @param[in,out] global_sum Global sum of all elements (each work item adds its partial sum atomically) - * @param[in,out] global_sum_sq Global sum of squares of all elements (only present when -DSTDDEV is defined; updated atomically) - */ -__kernel void mean_stddev_accumulate( - IMAGE_DECLARATION(src), - uint height, - __global ulong *global_sum -#ifdef STDDEV - , - __global ulong *global_sum_sq -#endif /* STDDEV */ -) -{ - // Get pixels pointer - Image src = CONVERT_TO_IMAGE_STRUCT(src); - - uint8 tmp_sum = 0; -#ifdef STDDEV - uint8 tmp_sum_sq = 0; -#endif /* STDDEV */ - // Calculate partial sum - for(int i = 0; i < height; i++) - { - // Load data - uint8 data = convert_uint8(vload8(0, offset(&src, 0, i))); - - tmp_sum += data; -#ifdef STDDEV - tmp_sum_sq += data * data; -#endif /* STDDEV */ - } - // Perform reduction - tmp_sum.s0123 += tmp_sum.s4567; - tmp_sum.s01 += tmp_sum.s23; - atom_add(global_sum, tmp_sum.s0 + tmp_sum.s1); - -#ifdef STDDEV - tmp_sum_sq.s0123 += tmp_sum_sq.s4567; - tmp_sum_sq.s01 += tmp_sum_sq.s23; - atom_add(global_sum_sq, tmp_sum_sq.s0 + tmp_sum_sq.s1); -#endif /* STDDEV */ -} - -#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : disable diff --git a/src/core/CL/cl_kernels/minmaxloc.cl b/src/core/CL/cl_kernels/minmaxloc.cl deleted file mode 100644 index 1045f22fb1..0000000000 --- a/src/core/CL/cl_kernels/minmaxloc.cl +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright (c) 2017 Arm Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" -#include "types.h" - -#ifndef DATA_TYPE_MIN -#define DATA_TYPE_MIN 0x0 -#endif /* DATA_TYPE_MIN */ - -#ifndef DATA_TYPE_MAX -#define DATA_TYPE_MAX 0xFF -#endif /* DATA_TYPE_MAX */ - /** Reinterprets the bits of a float as an int; when that int is negative its lower 31 bits are inverted (XOR with 0x7FFFFFFF). - * - * @note Used by the minmax kernel to feed float values to the integer atomic_min and atomic_max; presumably the mapping keeps integer ordering consistent with float ordering (relies on the float bit layout, confirm for the target). - * - * @param[in] val Float value to convert. - * - * @return The integer key derived from @p val. - */ -inline int FloatFlip(float val) -{ - union - { - int int_val; - float flt_val; - } u_val; - u_val.flt_val = val; - return (u_val.int_val >= 0) ? u_val.int_val : u_val.int_val ^ 0x7FFFFFFF; -} - -__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(DATA_TYPE_MIN); -__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_max = (VEC_DATA_TYPE(DATA_TYPE, 16))(DATA_TYPE_MAX); -__constant int16 idx16 = (int16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - -/** This function identifies the minimum and maximum value of an input image. - * - * @note Input image data type must be passed as a preprocessor argument using -DDATA_TYPE.
- * Moreover, the minimum and maximum value of the given data type should be provided using -DDATA_TYPE_MIN and -DDATA_TYPE_MAX respectively (they default to 0x0 and 0xFF when omitted). - * @note In case image width is not a multiple of 16 then -DNON_MULTIPLE_OF_16 must be passed. - * @note When -DIS_DATA_TYPE_FLOAT is defined, the row minimum and maximum are passed through FloatFlip before the atomic update of @p min_max. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in,out] min_max Pointer to buffer with minimum value in position 0 and maximum value in position 1 (updated with atomic_min and atomic_max) - * @param[in] width Input image width - */ -__kernel void minmax( - IMAGE_DECLARATION(src), - __global int *min_max, - int width) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - - // Initialize local minimum and local maximum - VEC_DATA_TYPE(DATA_TYPE, 16) - local_min = type_max; - VEC_DATA_TYPE(DATA_TYPE, 16) - local_max = type_min; - - // Calculate min/max of row - int i = 0; - for(; i + 16 <= width; i += 16) - { - VEC_DATA_TYPE(DATA_TYPE, 16) - data = vload16(0, (__global DATA_TYPE *)offset(&src, i, 0)); - local_min = min(data, local_min); - local_max = max(data, local_max); - } - -#ifdef NON_MULTIPLE_OF_16 - // Handle non multiple of 16 - VEC_DATA_TYPE(DATA_TYPE, 16) - data = vload16(0, (__global DATA_TYPE *)offset(&src, i, 0)); -#ifdef IS_DATA_TYPE_FLOAT - int16 valid_indices = (i + idx16) < width; -#else /* IS_DATA_TYPE_FLOAT */ - VEC_DATA_TYPE(DATA_TYPE, 16) - valid_indices = CONVERT((i + idx16) < width, VEC_DATA_TYPE(DATA_TYPE, 16)); -#endif /* IS_DATA_TYPE_FLOAT */ - local_max = max(local_max, select(type_min, data, valid_indices)); - local_min = min(local_min, select(type_max,
data, valid_indices)); -#endif /* NON_MULTIPLE_OF_16 */ - - // Perform min/max reduction - local_min.s01234567 = min(local_min.s01234567, local_min.s89ABCDEF); - local_max.s01234567 = max(local_max.s01234567, local_max.s89ABCDEF); - - local_min.s0123 = min(local_min.s0123, local_min.s4567); - local_max.s0123 = max(local_max.s0123, local_max.s4567); - - local_min.s01 = min(local_min.s01, local_min.s23); - local_max.s01 = max(local_max.s01, local_max.s23); - - local_min.s0 = min(local_min.s0, local_min.s1); - local_max.s0 = max(local_max.s0, local_max.s1); - - // Update global min/max -#ifdef IS_DATA_TYPE_FLOAT - atomic_min(&min_max[0], FloatFlip(local_min.s0)); - atomic_max(&min_max[1], FloatFlip(local_max.s0)); -#else /* IS_DATA_TYPE_FLOAT */ - atomic_min(&min_max[0], local_min.s0); - atomic_max(&min_max[1], local_max.s0); -#endif /* IS_DATA_TYPE_FLOAT */ -} - -/** This function counts the min and max occurrences in an image and tags their position. - * - * @note -DCOUNT_MIN_MAX should be specified if we want to count the occurrences of the minimum and maximum values. - * @note -DLOCATE_MIN and/or -DLOCATE_MAX should be specified if we want to store the position of each occurrence on the given array; they only take effect when -DCOUNT_MIN_MAX is also specified. - * - * @param[in] src_ptr Pointer to the source image.
Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] min_max Pointer to buffer with minimum value in position 0 and maximum value in position 1 - * @param[out] min_max_count Pointer to buffer with minimum value occurrences in position 0 and maximum value occurrences in position 1 - * @param[out] min_loc Array that holds the location of the minimum value occurrences - * @param[in] max_min_loc_count The maximum number of min value occurrences coordinates the array can hold - * @param[out] max_loc Array that holds the location of the maximum value occurrences - * @param[in] max_max_loc_count The maximum number of max value occurrences coordinates the array can hold - */ -__kernel void minmaxloc( - IMAGE_DECLARATION(src), - __global int *min_max, - __global uint *min_max_count -#ifdef LOCATE_MIN - , - __global Coordinates2D *min_loc, uint max_min_loc_count -#endif /* LOCATE_MIN */ -#ifdef LOCATE_MAX - , - __global Coordinates2D *max_loc, uint max_max_loc_count -#endif /* LOCATE_MAX */ -) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - /* NOTE(review): in the float path min_max is read back as raw floats, while the minmax kernel stores FloatFlip keys; presumably the host converts the buffer in between, confirm against the caller */ -#ifdef IS_DATA_TYPE_FLOAT - __global float *min_max_ptr = (__global float *)min_max; - float min_value = min_max_ptr[0]; - float max_value = min_max_ptr[1]; -#else /* IS_DATA_TYPE_FLOAT */ - int min_value = min_max[0]; - int max_value = min_max[1]; -#endif /* IS_DATA_TYPE_FLOAT */ - - DATA_TYPE value = *((__global DATA_TYPE *)src.ptr); -#ifdef COUNT_MIN_MAX - if(value == min_value) - { - uint idx = atomic_inc(&min_max_count[0]); /* atomic_inc returns the count before the increment; it is used below as the slot of this occurrence */ -#ifdef LOCATE_MIN - if(idx < max_min_loc_count) - { -
min_loc[idx].x = get_global_id(0); - min_loc[idx].y = get_global_id(1); - } -#endif /* LOCATE_MIN */ - } - if(value == max_value) - { - uint idx = atomic_inc(&min_max_count[1]); /* same slot scheme as for the minimum */ -#ifdef LOCATE_MAX - if(idx < max_max_loc_count) - { - max_loc[idx].x = get_global_id(0); - max_loc[idx].y = get_global_id(1); - } -#endif /* LOCATE_MAX */ - } -#endif /* COUNT_MIN_MAX */ -} diff --git a/src/core/CL/cl_kernels/non_linear_filter3x3.cl b/src/core/CL/cl_kernels/non_linear_filter3x3.cl deleted file mode 100644 index 93c5024c52..0000000000 --- a/src/core/CL/cl_kernels/non_linear_filter3x3.cl +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Copyright (c) 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" -#include "non_linear_filter_helpers.h" - -/** This function applies a non-linear filter on a 3x3 box basis on an input image, storing 8 output pixels per work item.
- * - * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN. - * @note Three rows of 16 pixels are loaded, starting one pixel to the left of and one row above the position addressed by the work item. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void non_linear_filter_box3x3( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Load values - uchar16 top = vload16(0, offset(&src, -1, -1)); - uchar16 middle = vload16(0, offset(&src, -1, 0)); - uchar16 bottom = vload16(0, offset(&src, -1, 1)); - - // Apply respective filter -#ifdef MIN - uchar16 tmp = min(top, min(middle, bottom)); - uchar8 out = row_reduce_min_3(tmp); -#elif defined(MAX) - uchar16 tmp = max(top, max(middle, bottom)); - uchar8 out = row_reduce_max_3(tmp); -#elif defined(MEDIAN) - uchar8 p0 = top.s01234567; - uchar8 p1 = top.s12345678; - uchar8 p2 = top.s23456789; - uchar8 p3 = middle.s01234567; - uchar8 p4 = middle.s12345678;
- uchar8 p5 = middle.s23456789; - uchar8 p6 = bottom.s01234567; - uchar8 p7 = bottom.s12345678; - uchar8 p8 = bottom.s23456789; - uchar8 out = sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8); -#else /* MIN or MAX or MEDIAN */ -#error "Unsupported filter function" -#endif /* MIN or MAX or MEDIAN */ - - // Store result - vstore8(out, 0, dst.ptr); -} - -/** This function applies a non linear filter on a 3x3 cross basis on an input image. - * - * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image.
Supported data types: U8 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void non_linear_filter_cross3x3( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); /* cross footprint: the rows above and below are loaded at column offset 0 (8 pixels), the middle row at column offset -1 (16 pixels) */ - - // Load values - uchar8 top = vload8(0, offset(&src, 0, -1)); - uchar16 middle = vload16(0, offset(&src, -1, 0)); - uchar8 bottom = vload8(0, offset(&src, 0, 1)); - - // Apply respective filter -#ifdef MIN - uchar8 tmp_middle = row_reduce_min_3(middle); - uchar8 out = min(tmp_middle, min(top, bottom)); -#elif defined(MAX) - uchar8 tmp_middle = row_reduce_max_3(middle); - uchar8 out = max(tmp_middle, max(top, bottom)); -#elif defined(MEDIAN) - uchar8 p0 = top.s01234567; - uchar8 p1 = middle.s01234567; - uchar8 p2 = middle.s12345678; - uchar8 p3 = middle.s23456789; - uchar8 p4 = bottom.s01234567; - uchar8 out = sort5(p0, p1, p2, p3, p4); -#else /* MIN or MAX or MEDIAN */ -#error "Unsupported filter function" -#endif /* MIN or MAX or MEDIAN */ - - // Store result - vstore8(out, 0, dst.ptr); -} - -/** This function applies a non-linear filter on a 3x3 disk basis on an input image (its body performs the same loads and reductions as non_linear_filter_box3x3). - * - * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN. - * - * @param[in] src_ptr Pointer to the source image.
Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void non_linear_filter_disk3x3( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); /* same neighbourhood as non_linear_filter_box3x3: three 16-pixel rows loaded from offset (-1, -1) */ - - // Load values - uchar16 top = vload16(0, offset(&src, -1, -1)); - uchar16 middle = vload16(0, offset(&src, -1, 0)); - uchar16 bottom = vload16(0, offset(&src, -1, 1)); - - // Apply respective filter -#ifdef MIN - uchar16 tmp = min(top, min(middle, bottom)); - uchar8 out = row_reduce_min_3(tmp); -#elif defined(MAX) - uchar16 tmp = max(top, max(middle, bottom)); - uchar8 out = row_reduce_max_3(tmp); -#elif defined(MEDIAN) - uchar8 p0 = top.s01234567; - uchar8 p1 = top.s12345678; - uchar8 p2 = top.s23456789; - uchar8 p3 = middle.s01234567; - uchar8 p4 = middle.s12345678; - uchar8 p5 = middle.s23456789; - uchar8 p6 = bottom.s01234567; - uchar8 p7 = bottom.s12345678; - uchar8 p8 = bottom.s23456789; - uchar8 out = sort9(p0, p1, p2, p3, p4, p5, p6,
p7, p8); -#else /* MIN or MAX or MEDIAN */ -#error "Unsupported filter function" -#endif /* MIN or MAX or MEDIAN */ - - // Store result - vstore8(out, 0, dst.ptr); -} diff --git a/src/core/CL/cl_kernels/non_linear_filter5x5.cl b/src/core/CL/cl_kernels/non_linear_filter5x5.cl deleted file mode 100644 index 7c87284a72..0000000000 --- a/src/core/CL/cl_kernels/non_linear_filter5x5.cl +++ /dev/null @@ -1,483 +0,0 @@ -/* - * Copyright (c) 2016, 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" -#include "non_linear_filter_helpers.h" - -// Sorting networks below were generated using http://pages.ripco.net/~jgamble/nw.html - -/** Sorting network applied to the 21 taps of a diameter-5 disk for 8 neighbouring positions at once; returns the median of each. - * - * @param[in] top2 Values of elements two rows above. - * @param[in] top Values of elements one row above. - * @param[in] middle Values of middle elements.
- * @param[in] bottom Values of elements one row below. - * @param[in] bottom2 Values of elements two rows below. - * - * @return Median values for 8 elements. - */ -inline uchar8 median_disk5x5(uchar16 top2, uchar16 top, uchar16 middle, uchar16 bottom, uchar16 bottom2) -{ - uchar8 p0 = top2.s01234567; - uchar8 p1 = top2.s12345678; - uchar8 p2 = top2.s23456789; - uchar8 p3 = top.s01234567; - uchar8 p4 = top.s12345678; - uchar8 p5 = top.s23456789; - uchar8 p6 = top.s3456789A; - uchar8 p7 = top.s456789AB; - uchar8 p8 = middle.s01234567; - uchar8 p9 = middle.s12345678; - uchar8 p10 = middle.s23456789; - uchar8 p11 = middle.s3456789A; - uchar8 p12 = middle.s456789AB; - uchar8 p13 = bottom.s01234567; - uchar8 p14 = bottom.s12345678; - uchar8 p15 = bottom.s23456789; - uchar8 p16 = bottom.s3456789A; - uchar8 p17 = bottom.s456789AB; - uchar8 p18 = bottom2.s01234567; - uchar8 p19 = bottom2.s12345678; - uchar8 p20 = bottom2.s23456789; - - SORT(p0, p1); - SORT(p2, p3); - SORT(p4, p5); - SORT(p6, p7); - SORT(p8, p9); - SORT(p10, p11); - SORT(p12, p13); - SORT(p14, p15); - SORT(p16, p17); - SORT(p18, p19); - SORT(p0, p2); - SORT(p1, p3); - SORT(p4, p6); - SORT(p5, p7); - SORT(p8, p10); - SORT(p9, p11); - SORT(p12, p14); - SORT(p13, p15); - SORT(p16, p18); - SORT(p17, p19); - SORT(p1, p2); - SORT(p5, p6); - SORT(p0, p4); - SORT(p3, p7); - SORT(p9, p10); - SORT(p13, p14); - SORT(p8, p12); - SORT(p11, p15); - SORT(p17, p18); - SORT(p16, p20); - SORT(p1, p5); - SORT(p2, p6); - SORT(p9, p13); - SORT(p10, p14); - SORT(p0, p8); - SORT(p7, p15); - SORT(p17, p20); - SORT(p1, p4); - SORT(p3, p6); - SORT(p9, p12); - SORT(p11, p14); - SORT(p18, p20); - SORT(p0, p16); - SORT(p2, p4); - SORT(p3, p5); - SORT(p10, p12); - SORT(p11, p13); - SORT(p1, p9); - SORT(p6, p14); - SORT(p19, p20); - SORT(p3, p4); - SORT(p11, p12); - SORT(p1, p8); - SORT(p2, p10); - SORT(p5, p13); - SORT(p7, p14); - SORT(p3, p11); - SORT(p2, p8); - SORT(p4, p12); - SORT(p7, p13); - SORT(p1, p17); - SORT(p3, p10); - 
SORT(p5, p12); - SORT(p1, p16); - SORT(p2, p18); - SORT(p3, p9); - SORT(p6, p12); - SORT(p2, p16); - SORT(p3, p8); - SORT(p7, p12); - SORT(p5, p9); - SORT(p6, p10); - SORT(p4, p8); - SORT(p7, p11); - SORT(p3, p19); - SORT(p5, p8); - SORT(p7, p10); - SORT(p3, p18); - SORT(p4, p20); - SORT(p6, p8); - SORT(p7, p9); - SORT(p3, p17); - SORT(p5, p20); - SORT(p7, p8); - SORT(p3, p16); - SORT(p6, p20); - SORT(p5, p17); - SORT(p7, p20); - SORT(p4, p16); - SORT(p6, p18); - SORT(p5, p16); - SORT(p7, p19); - SORT(p7, p18); - SORT(p6, p16); - SORT(p7, p17); - SORT(p10, p18); - SORT(p7, p16); - SORT(p9, p17); - SORT(p8, p16); - SORT(p9, p16); - SORT(p10, p16); - - return p10; -} - -/** Sorting network to sort 8 boxes of size 5 and return their median. - * - * @param[in] top2 Values of elements two rows above. - * @param[in] top Values of elements one row above. - * @param[in] middle Values of middle elements. - * @param[in] bottom Values of elements one row below. - * @param[in] bottom2 Values of elements two rows below. - * - * @return Median values for 8 elements. 
 */
inline uchar8 median_box5x5(uchar16 top2, uchar16 top, uchar16 middle, uchar16 bottom, uchar16 bottom2)
{
    // Build the 25 inputs of the network: for each of the five rows take five 8-wide slices
    // shifted by 0..4 elements, so lane i of p0..p24 holds the 5x5 neighbourhood of output i.
    uchar8 p0  = top2.s01234567;
    uchar8 p1  = top2.s12345678;
    uchar8 p2  = top2.s23456789;
    uchar8 p3  = top2.s3456789A;
    uchar8 p4  = top2.s456789AB;
    uchar8 p5  = top.s01234567;
    uchar8 p6  = top.s12345678;
    uchar8 p7  = top.s23456789;
    uchar8 p8  = top.s3456789A;
    uchar8 p9  = top.s456789AB;
    uchar8 p10 = middle.s01234567;
    uchar8 p11 = middle.s12345678;
    uchar8 p12 = middle.s23456789;
    uchar8 p13 = middle.s3456789A;
    uchar8 p14 = middle.s456789AB;
    uchar8 p15 = bottom.s01234567;
    uchar8 p16 = bottom.s12345678;
    uchar8 p17 = bottom.s23456789;
    uchar8 p18 = bottom.s3456789A;
    uchar8 p19 = bottom.s456789AB;
    uchar8 p20 = bottom2.s01234567;
    uchar8 p21 = bottom2.s12345678;
    uchar8 p22 = bottom2.s23456789;
    uchar8 p23 = bottom2.s3456789A;
    uchar8 p24 = bottom2.s456789AB;

    // Fixed sequence of element-wise compare-exchange steps (SORT leaves the minimum in its first
    // argument and the maximum in its second). The order of the steps is significant: do not reorder.
    SORT(p1, p2);
    SORT(p0, p1);
    SORT(p1, p2);
    SORT(p4, p5);
    SORT(p3, p4);
    SORT(p4, p5);
    SORT(p0, p3);
    SORT(p2, p5);
    SORT(p2, p3);
    SORT(p1, p4);
    SORT(p1, p2);
    SORT(p3, p4);
    SORT(p7, p8);
    SORT(p6, p7);
    SORT(p7, p8);
    SORT(p10, p11);
    SORT(p9, p10);
    SORT(p10, p11);
    SORT(p6, p9);
    SORT(p8, p11);
    SORT(p8, p9);
    SORT(p7, p10);
    SORT(p7, p8);
    SORT(p9, p10);
    SORT(p0, p6);
    SORT(p4, p10);
    SORT(p4, p6);
    SORT(p2, p8);
    SORT(p2, p4);
    SORT(p6, p8);
    SORT(p1, p7);
    SORT(p5, p11);
    SORT(p5, p7);
    SORT(p3, p9);
    SORT(p3, p5);
    SORT(p7, p9);
    SORT(p1, p2);
    SORT(p3, p4);
    SORT(p5, p6);
    SORT(p7, p8);
    SORT(p9, p10);
    SORT(p13, p14);
    SORT(p12, p13);
    SORT(p13, p14);
    SORT(p16, p17);
    SORT(p15, p16);
    SORT(p16, p17);
    SORT(p12, p15);
    SORT(p14, p17);
    SORT(p14, p15);
    SORT(p13, p16);
    SORT(p13, p14);
    SORT(p15, p16);
    SORT(p19, p20);
    SORT(p18, p19);
    SORT(p19, p20);
    SORT(p21, p22);
    SORT(p23, p24);
    SORT(p21, p23);
    SORT(p22, p24);
    SORT(p22, p23);
    SORT(p18, p21);
    SORT(p20, p23);
    SORT(p20, p21);
    SORT(p19, p22);
    SORT(p22, p24);
    SORT(p19, p20);
    SORT(p21, p22);
    SORT(p23, p24);
    SORT(p12, p18);
    SORT(p16, p22);
    SORT(p16, p18);
    SORT(p14, p20);
    SORT(p20, p24);
    SORT(p14, p16);
    SORT(p18, p20);
    SORT(p22, p24);
    SORT(p13, p19);
    SORT(p17, p23);
    SORT(p17, p19);
    SORT(p15, p21);
    SORT(p15, p17);
    SORT(p19, p21);
    SORT(p13, p14);
    SORT(p15, p16);
    SORT(p17, p18);
    SORT(p19, p20);
    SORT(p21, p22);
    SORT(p23, p24);
    SORT(p0, p12);
    SORT(p8, p20);
    SORT(p8, p12);
    SORT(p4, p16);
    SORT(p16, p24);
    SORT(p12, p16);
    SORT(p2, p14);
    SORT(p10, p22);
    SORT(p10, p14);
    SORT(p6, p18);
    SORT(p6, p10);
    SORT(p10, p12);
    SORT(p1, p13);
    SORT(p9, p21);
    SORT(p9, p13);
    SORT(p5, p17);
    SORT(p13, p17);
    SORT(p3, p15);
    SORT(p11, p23);
    SORT(p11, p15);
    SORT(p7, p19);
    SORT(p7, p11);
    SORT(p11, p13);
    SORT(p11, p12);

    // p12 is the slot this network returns as the per-lane median of the 25 inputs.
    return p12;
}

/** This function applies a non linear filter on a 5x5 box basis on an input image.
 *
 * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
 *
 * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
 * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] dst_ptr                           Pointer to the destination image.
Supported data types: U8 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void non_linear_filter_box5x5( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Load values - uchar16 top2 = vload16(0, offset(&src, -2, -2)); - uchar16 top = vload16(0, offset(&src, -2, -1)); - uchar16 middle = vload16(0, offset(&src, -2, 0)); - uchar16 bottom = vload16(0, offset(&src, -2, 1)); - uchar16 bottom2 = vload16(0, offset(&src, -2, 2)); - - // Apply respective filter -#ifdef MIN - uchar16 tmp = min(middle, min(min(top2, top), min(bottom, bottom2))); - uchar8 out = row_reduce_min_5(tmp); -#elif defined(MAX) - uchar16 tmp = max(middle, max(max(top2, top), max(bottom, bottom2))); - uchar8 out = row_reduce_max_5(tmp); -#elif defined(MEDIAN) - uchar8 out = median_box5x5(top2, top, middle, bottom, bottom2); -#else /* MIN or MAX or MEDIAN */ -#error "Unsupported filter function" -#endif /* MIN or MAX or MEDIAN */ - - // Store result - vstore8(out, 0, dst.ptr); -} - -/** This function applies a non linear filter on a 5x5 cross basis on an input image. - * - * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN. - * - * @param[in] src_ptr Pointer to the source image. 
Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void non_linear_filter_cross5x5( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Load values - uchar8 top2 = vload8(0, offset(&src, 0, -2)); - uchar8 top = vload8(0, offset(&src, 0, -1)); - uchar16 middle = vload16(0, offset(&src, -2, 0)); - uchar8 bottom = vload8(0, offset(&src, 0, 1)); - uchar8 bottom2 = vload8(0, offset(&src, 0, 2)); - - // Apply respective filter -#ifdef MIN - uchar8 tmp_middle = row_reduce_min_5(middle); - uchar8 out = min(tmp_middle, min(min(top2, top), min(bottom, bottom2))); -#elif defined(MAX) - uchar8 tmp_middle = row_reduce_max_5(middle); - uchar8 out = max(tmp_middle, max(max(top2, top.s01234567), max(bottom, bottom2))); -#elif defined(MEDIAN) - uchar8 p0 = top2; - uchar8 p1 = top; - uchar8 p2 = middle.s01234567; - uchar8 p3 = middle.s12345678; - uchar8 p4 = middle.s23456789; - uchar8 p5 = 
middle.s3456789A; - uchar8 p6 = middle.s456789AB; - uchar8 p7 = bottom; - uchar8 p8 = bottom2; - uchar8 out = sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8); -#else /* MIN or MAX or MEDIAN */ -#error "Unsupported filter function" -#endif /* MIN or MAX or MEDIAN */ - - // Store result - vstore8(out, 0, dst.ptr); -} - -/** This function applies a non linear filter on a 5x5 disk basis on an input image. - * - * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. 
Supported data types: U8 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void non_linear_filter_disk5x5( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Load values - uchar16 top2 = vload16(0, offset(&src, -2, -2)); - uchar16 top = vload16(0, offset(&src, -2, -1)); - uchar16 middle = vload16(0, offset(&src, -2, 0)); - uchar16 bottom = vload16(0, offset(&src, -2, 1)); - uchar16 bottom2 = vload16(0, offset(&src, -2, 2)); - - // Shift top2 and bottom2 values - top2 = top2.s123456789ABCDEFF; - bottom2 = bottom2.s123456789ABCDEFF; - - // Apply respective filter -#ifdef MIN - uchar16 tmp_3 = min(top2, bottom2); - uchar16 tmp_5 = min(middle, min(top, bottom)); - uchar8 tmp_3_red = row_reduce_min_3(tmp_3); - uchar8 tmp_5_red = row_reduce_min_5(tmp_5); - uchar8 out = min(tmp_3_red, tmp_5_red); -#elif defined(MAX) - uchar16 tmp_3 = max(top2, bottom2); - uchar16 tmp_5 = max(middle, max(top, bottom)); - uchar8 tmp_3_red = row_reduce_max_3(tmp_3); - uchar8 tmp_5_red = row_reduce_max_5(tmp_5); - uchar8 out = max(tmp_3_red, tmp_5_red); -#elif defined(MEDIAN) - uchar8 out = median_disk5x5(top2, top, middle, bottom, bottom2); -#else /* MIN or MAX or MEDIAN */ -#error "Unsupported filter function" -#endif /* MIN or MAX or MEDIAN */ - - // Store result - vstore8(out, 0, dst.ptr); -} diff --git a/src/core/CL/cl_kernels/non_linear_filter_helpers.h b/src/core/CL/cl_kernels/non_linear_filter_helpers.h deleted file mode 100644 index 
3fcfad46f5..0000000000 --- a/src/core/CL/cl_kernels/non_linear_filter_helpers.h +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/** Sorts element-wise two vectors. - * - * @param[in, out] a First vector - * @param[in, out] b Second vector - */ -#define SORT(a, b) \ - { \ - uchar8 min_val = min(a, b); \ - uchar8 max_val = max(a, b); \ - a = min_val; \ - b = max_val; \ - } - -// Sorting networks below were generated using http://pages.ripco.net/~jgamble/nw.html - -/** Sorting network to sort 5 vectors of 8 elements and return their median. - * - * @param[in] p0 First element vector - * @param[in] p1 Second element vector - * @param[in] p2 Third element vector - * @param[in] p3 Fourth element vector - * @param[in] p4 Fifth element vector - * - * @return Median values for 8 elements. 
 */
inline uchar8 sort5(uchar8 p0, uchar8 p1, uchar8 p2, uchar8 p3, uchar8 p4)
{
    // Fixed compare-exchange sequence operating on the by-value copies; the order is significant.
    SORT(p0, p1);
    SORT(p2, p3);
    SORT(p0, p2);
    SORT(p1, p3);
    SORT(p1, p2);
    SORT(p0, p4);
    SORT(p1, p4);
    SORT(p2, p4);

    // p2 is the slot returned as the per-lane median
    return p2;
}

/** Sorting network to sort 9 vectors of 8 elements and return their median.
 *
 * @param[in] p0 First element vector
 * @param[in] p1 Second element vector
 * @param[in] p2 Third element vector
 * @param[in] p3 Fourth element vector
 * @param[in] p4 Fifth element vector
 * @param[in] p5 Sixth element vector
 * @param[in] p6 Seventh element vector
 * @param[in] p7 Eighth element vector
 * @param[in] p8 Ninth element vector
 *
 * @return Median values for 8 elements.
 */
inline uchar8 sort9(uchar8 p0, uchar8 p1, uchar8 p2, uchar8 p3, uchar8 p4, uchar8 p5, uchar8 p6, uchar8 p7, uchar8 p8)
{
    // Fixed compare-exchange sequence operating on the by-value copies; the order is significant.
    SORT(p1, p2);
    SORT(p4, p5);
    SORT(p7, p8);
    SORT(p0, p1);
    SORT(p3, p4);
    SORT(p6, p7);
    SORT(p1, p2);
    SORT(p4, p5);
    SORT(p7, p8);
    SORT(p0, p3);
    SORT(p5, p8);
    SORT(p4, p7);
    SORT(p3, p6);
    SORT(p1, p4);
    SORT(p2, p5);
    SORT(p4, p7);
    // Note the argument order in the last three steps: the minimum goes to the first argument.
    SORT(p4, p2);
    SORT(p6, p4);
    SORT(p4, p2);

    // p4 is the slot returned as the per-lane median
    return p4;
}

/** Calculate the minimum of a sliding window of size 3.
 *
 * @param val Values to calculate the minimum values
 *
 * @return Minimum values of 8 elements on a sliding window of size 3.
 */
inline uchar8 row_reduce_min_3(uchar16 val)
{
    // Output lane i is the minimum of input elements i, i+1 and i+2
    return min(val.s01234567, min(val.s12345678, val.s23456789));
}

/** Calculate the maximum of a sliding window of size 3.
 *
 * @param val Values to calculate the maximum values
 *
 * @return Maximum values of 8 elements on a sliding window of size 3.
 */
inline uchar8 row_reduce_max_3(uchar16 val)
{
    // Output lane i is the maximum of input elements i, i+1 and i+2
    return max(val.s01234567, max(val.s12345678, val.s23456789));
}

/** Calculate the minimum of a sliding window of size 5.
 *
 * @param val Values to calculate the minimum values
 *
 * @return Minimum values of 8 elements on a sliding window of size 5.
- */ -inline uchar8 row_reduce_min_5(uchar16 val) -{ - return min(val.s01234567, min(min(val.s12345678, val.s23456789), min(val.s3456789A, val.s456789AB))); -} - -/** Calculate the maximum of a sliding window of size 5. - * - * @param val Values to calculate the maximum values - * - * @return Maximum values of 8 elements on a sliding window of size 5. - */ -inline uchar8 row_reduce_max_5(uchar16 val) -{ - return max(val.s01234567, max(max(val.s12345678, val.s23456789), max(val.s3456789A, val.s456789AB))); -} diff --git a/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl b/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl deleted file mode 100644 index 9bbde1a57f..0000000000 --- a/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl +++ /dev/null @@ -1,521 +0,0 @@ -/* - * Copyright (c) 2017-2018 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
 */
#include "helpers.h"
#include "types.h"

/*
 * The criteria for lost tracking is that the spatial gradient matrix has:
 * - Determinant less than DETERMINANT_THR
 * - or minimum eigenvalue smaller than EIGENVALUE_THR
 *
 * The thresholds for the determinant and the minimum eigenvalue are
 * defined by the OpenVX spec
 *
 * Note: Also lost tracking happens when the point tracked coordinate is outside
 * the image coordinates
 *
 * https://www.khronos.org/registry/vx/specs/1.0/html/d0/d0c/group__group__vision__function__opticalflowpyrlk.html
 */

/* Internal Lucas-Kanade Keypoint struct (four floats; some kernels in this file access the same buffers as float4) */
typedef struct InternalKeypoint
{
    float x;               /**< The x coordinate. */
    float y;               /**< The y coordinate. */
    float tracking_status; /**< A zero indicates a lost point. Initialized to 1 by corner detectors. */
    float dummy;           /**< Dummy member for alignment. */
} InternalKeypoint;

/** Threshold for the determinant. Used for lost tracking criteria */
#define DETERMINANT_THR 1.0e-07f

/** Thresholds for minimum eigenvalue. Used for lost tracking criteria */
#define EIGENVALUE_THR 1.0e-04f

/** Constants used for Lucas-Kanade Algorithm */
// Number of bits used to scale the bilinear weights before rounding
#define W_BITS (14)
// 2^-20: applied to the integer gradient/mismatch accumulators when converting them to float
#define FLT_SCALE (1.0f / (float)(1 << 20))
// 2^W_BITS: scale applied to the bilinear weights used on the old and new images
#define D0 ((float)(1 << W_BITS))
// 2^-(W_BITS - 5): applied to the weighted pixel sum computed with the D0-scaled weights
#define D1 (1.0f / (float)(1 << (W_BITS - 5)))

/** Initializes the internal new points array when the level of pyramid is NOT equal to max.
 *
 * @param[in,out] old_points_internal An array of internal key points that are defined at the old_images high resolution pyramid.
 * @param[in,out] new_points_internal An array of internal key points that are defined at the new_images high resolution pyramid.
 * @param[in]     scale               Scale factor to apply for the new_point coordinates.
- */ -__kernel void init_level( - __global float4 *old_points_internal, - __global float4 *new_points_internal, - const float scale) -{ - int idx = get_global_id(0); - - // Get old and new keypoints - float4 old_point = old_points_internal[idx]; - float4 new_point = new_points_internal[idx]; - - // Scale accordingly with the pyramid_scale - old_point.xy *= (float2)(2.0f); - new_point.xy *= (float2)(2.0f); - - old_points_internal[idx] = old_point; - new_points_internal[idx] = new_point; -} - -/** Initializes the internal new points array when the level of pyramid is equal to max. - * - * @param[in] old_points An array of key points that are defined at the old_images high resolution pyramid. - * @param[in,out] old_points_internal An array of internal key points that are defined at the old_images high resolution pyramid. - * @param[out] new_points_internal An array of internal key points that are defined at the new_images high resolution pyramid. - * @param[in] scale Scale factor to apply for the new_point coordinates. - */ -__kernel void init_level_max( - __global Keypoint *old_points, - __global InternalKeypoint *old_points_internal, - __global InternalKeypoint *new_points_internal, - const float scale) -{ - int idx = get_global_id(0); - - Keypoint old_point = old_points[idx]; - - // Get old keypoint to track - InternalKeypoint old_point_internal; - old_point_internal.x = old_point.x * scale; - old_point_internal.y = old_point.y * scale; - old_point_internal.tracking_status = 1.f; - - // Store internal keypoints - old_points_internal[idx] = old_point_internal; - new_points_internal[idx] = old_point_internal; -} - -/** Initializes the new_points array when the level of pyramid is equal to max and if use_initial_estimate = 1. - * - * @param[in] old_points An array of key points that are defined at the old_images high resolution pyramid. - * @param[in] new_points_estimates An array of estimate key points that are defined at the old_images high resolution pyramid. 
- * @param[in,out] old_points_internal An array of internal key points that are defined at the old_images high resolution pyramid. - * @param[out] new_points_internal An array of internal key points that are defined at the new_images high resolution pyramid. - * @param[in] scale Scale factor to apply for the new_point coordinates. - */ -__kernel void init_level_max_initial_estimate( - __global Keypoint *old_points, - __global Keypoint *new_points_estimates, - __global InternalKeypoint *old_points_internal, - __global InternalKeypoint *new_points_internal, - const float scale) -{ - int idx = get_global_id(0); - - Keypoint old_point = old_points[idx]; - Keypoint new_point_estimate = new_points_estimates[idx]; - InternalKeypoint old_point_internal; - InternalKeypoint new_point_internal; - - // Get old keypoint to track - old_point_internal.x = old_point.x * scale; - old_point_internal.y = old_point.y * scale; - old_point_internal.tracking_status = 1.f; - - // Get new keypoint to track - new_point_internal.x = new_point_estimate.x * scale; - new_point_internal.y = new_point_estimate.y * scale; - new_point_internal.tracking_status = new_point_estimate.tracking_status; - - // Store internal keypoints - old_points_internal[idx] = old_point_internal; - new_points_internal[idx] = new_point_internal; -} - -/** Truncates the coordinates stored in new_points array - * - * @param[in] new_points_internal An array of estimate key points that are defined at the new_images high resolution pyramid. - * @param[out] new_points An array of internal key points that are defined at the new_images high resolution pyramid. 
- */ -__kernel void finalize( - __global InternalKeypoint *new_points_internal, - __global Keypoint *new_points) -{ - int idx = get_global_id(0); - - // Load internal keypoint - InternalKeypoint new_point_internal = new_points_internal[idx]; - - // Calculate output point - Keypoint new_point; - new_point.x = round(new_point_internal.x); - new_point.y = round(new_point_internal.y); - new_point.strength = 0.f; - new_point.scale = 0.f; - new_point.orientation = 0.f; - new_point.tracking_status = new_point_internal.tracking_status; - new_point.error = 0.f; - - // Store new point - new_points[idx] = new_point; -} - -/** Computes A11, A12, A22, min_eig, ival, ixval and iyval at level 0th of the pyramid. These values will be used in step 1. - * - * @param[in] old_image_ptr Pointer to the input old image. Supported data types: U8 - * @param[in] old_image_stride_x Stride of the input old image in X dimension (in bytes) - * @param[in] old_image_step_x old_image_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] old_image_stride_y Stride of the input old image in Y dimension (in bytes) - * @param[in] old_image_step_y old_image_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] old_image_offset_first_element_in_bytes The offset of the first element in the input old image - * @param[in] old_scharr_gx_ptr Pointer to the input scharr x image. 
 *                                                             Supported data types: S16
 * @param[in]      old_scharr_gx_stride_x                      Stride of the input scharr x image in X dimension (in bytes)
 * @param[in]      old_scharr_gx_step_x                        old_scharr_gx_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]      old_scharr_gx_stride_y                      Stride of the input scharr x image in Y dimension (in bytes)
 * @param[in]      old_scharr_gx_step_y                        old_scharr_gx_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]      old_scharr_gx_offset_first_element_in_bytes The offset of the first element in the input scharr x image
 * @param[in]      old_scharr_gy_ptr                           Pointer to the input scharr y image. Supported data types: S16
 * @param[in]      old_scharr_gy_stride_x                      Stride of the input scharr y image in X dimension (in bytes)
 * @param[in]      old_scharr_gy_step_x                        old_scharr_gy_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]      old_scharr_gy_stride_y                      Stride of the input scharr y image in Y dimension (in bytes)
 * @param[in]      old_scharr_gy_step_y                        old_scharr_gy_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]      old_scharr_gy_offset_first_element_in_bytes The offset of the first element in the input scharr y image
 * @param[in]      old_points                                  An array of key points. Those key points are defined at the old_images high resolution pyramid
 * @param[in, out] new_points                                  An output array of key points. Those key points are defined at the new_images high resolution pyramid
 * @param[out]     coeff                                       It stores | A11 | A12 | A22 | min_eig | for each keypoint
 * @param[out]     iold_val                                    It stores | ival | ixval | iyval | dummy | for each point in the window centered on old_keypoint
 * @param[in]      window_dimension                            The size of the window on which to perform the algorithm
 * @param[in]      window_dimension_pow2                       The squared size of the window on which to perform the algorithm
 * @param[in]      half_window                                 The half size of the window on which to perform the algorithm
 * @param[in]      border_limits                               It stores the right border limit (width - window_dimension - 1, height - window_dimension - 1,).
 *                                                             The kernel uses .xy as exclusive upper limits and .z as the lower limit for both coordinates.
 * @param[in]      eig_const                                   1.0f / (float)(2.0f * window_dimension * window_dimension)
 * @param[in]      level0                                      It is set to 1 if level 0 of the pyramid
 */
void __kernel lktracker_stage0(
    IMAGE_DECLARATION(old_image),
    IMAGE_DECLARATION(old_scharr_gx),
    IMAGE_DECLARATION(old_scharr_gy),
    __global float4 *old_points,
    __global float4 *new_points,
    __global float4 *coeff,
    __global short4 *iold_val,
    const int        window_dimension,
    const int        window_dimension_pow2,
    const int        half_window,
    const float3     border_limits,
    const float      eig_const,
    const int        level0)
{
    // One work-item per keypoint
    int idx = get_global_id(0);

    Image old_image     = CONVERT_TO_IMAGE_STRUCT_NO_STEP(old_image);
    Image old_scharr_gx = CONVERT_TO_IMAGE_STRUCT_NO_STEP(old_scharr_gx);
    Image old_scharr_gy = CONVERT_TO_IMAGE_STRUCT_NO_STEP(old_scharr_gy);

    // Get old keypoint (shifted by half_window so that it addresses the first window element)
    float2 old_keypoint = old_points[idx].xy - (float2)half_window;

    // Get the floor value
    float2 iold_keypoint = floor(old_keypoint);

    // Check if using the window dimension we can go out of boundary in the following for loops. If so, invalidate the tracked point
    if(any(iold_keypoint < border_limits.zz) || any(iold_keypoint >= border_limits.xy))
    {
        if(level0 == 1)
        {
            // Invalidate tracked point as we are at level 0
            new_points[idx].s2 = 0.0f;
        }

        // Not valid coordinate. It sets min_eig to 0.0f
        coeff[idx].s3 = 0.0f;

        return;
    }

    // Compute weight for the bilinear interpolation (fractional part of the keypoint position)
    float2 ab = old_keypoint - iold_keypoint;

    // Weight used for Bilinear-Interpolation on Scharr images
    // w_scharr.s0 = (1.0f - ab.x) * (1.0f - ab.y)
    // w_scharr.s1 = ab.x * (1.0f - ab.y)
    // w_scharr.s2 = (1.0f - ab.x) * ab.y
    // w_scharr.s3 = ab.x * ab.y

    float4 w_scharr;
    w_scharr.s3  = ab.x * ab.y;
    w_scharr.s0  = w_scharr.s3 + 1.0f - ab.x - ab.y;
    w_scharr.s12 = ab - (float2)w_scharr.s3;

    // Weight used for Bilinear-Interpolation on Old and New images
    // w.s0 = round(w_scharr.s0 * D0)
    // w.s1 = round(w_scharr.s1 * D0)
    // w.s2 = round(w_scharr.s2 * D0)
    // w.s3 = D0 - w.s0 - w.s1 - w.s2

    float4 w;
    w    = round(w_scharr * (float4)D0);
    w.s3 = D0 - w.s0 - w.s1 - w.s2; // Added for matching VX implementation

    // iG.s0 = A11, iG.s1 = A12, iG.s2 = A22 (integer accumulators); iG.s3 stays 0, min_eig is computed on G below
    int4 iG = (int4)0;

    // Window offset: start of this keypoint's slice in iold_val (window_dimension_pow2 entries per keypoint)
    int window_offset = idx * window_dimension_pow2;

    // Compute Spatial Gradient Matrix G
    for(ushort ky = 0; ky < window_dimension; ++ky)
    {
        int offset_y = iold_keypoint.y + ky;
        for(ushort kx = 0; kx < window_dimension; ++kx)
        {
            int    offset_x = iold_keypoint.x + kx;
            float4 px;

            // Load values from old_image for computing the bilinear interpolation
            // (two horizontally adjacent pixels from the current row and two from the row below)
            px = convert_float4((uchar4)(vload2(0, offset(&old_image, offset_x, offset_y)),
                                         vload2(0, offset(&old_image, offset_x, offset_y + 1))));

            // old_i.s0 = ival, old_i.s1 = ixval, old_i.s2 = iyval, old_i.s3 = dummy
            // NOTE(review): old_i.s3 is never assigned before the vector is rounded and stored below - confirm consumers ignore it
            float4 old_i;

            // Compute bilinear interpolation (with D1 scale factor) for ival
            old_i.s0 = dot(px, w) * D1;

            // Load values from old_scharr_gx for computing the bilinear interpolation
            px = convert_float4((short4)(vload2(0, (__global short *)offset(&old_scharr_gx, offset_x, offset_y)),
                                         vload2(0, (__global short *)offset(&old_scharr_gx, offset_x, offset_y + 1))));

            // Compute bilinear interpolation for ixval
            old_i.s1 = dot(px, w_scharr);

            // Load values from old_scharr_gy for computing the bilinear interpolation
            px = convert_float4((short4)(vload2(0, (__global short *)offset(&old_scharr_gy, offset_x, offset_y)),
                                         vload2(0, (__global short *)offset(&old_scharr_gy, offset_x, offset_y + 1))));

            // Compute bilinear interpolation for iyval
            old_i.s2 = dot(px, w_scharr);

            // Rounding (it could be omitted. Used just for matching the VX implementation)
            int4 iold = convert_int4(round(old_i));

            // Accumulate values in the Spatial Gradient Matrix
            iG.s0 += (int)(iold.s1 * iold.s1);
            iG.s1 += (int)(iold.s1 * iold.s2);
            iG.s2 += (int)(iold.s2 * iold.s2);

            // Store ival, ixval and iyval
            iold_val[window_offset + kx] = convert_short4(iold);
        }
        // Advance to the next row of the window inside iold_val
        window_offset += window_dimension;
    }

    // Scale iA11, iA12 and iA22
    float4 G = convert_float4(iG) * (float4)FLT_SCALE;

    // Compute minimum eigen value
    G.s3 = (float)(G.s2 + G.s0 - sqrt(pown(G.s0 - G.s2, 2) + 4.0f * G.s1 * G.s1)) * eig_const;

    // Store A11, A12, A22 and min_eig
    coeff[idx] = G;
}

/** Computes the motion vector for a given keypoint
 *
 * @param[in]      new_image_ptr                           Pointer to the input new image.
Supported data types: U8 - * @param[in] new_image_stride_x Stride of the input new image in X dimension (in bytes) - * @param[in] new_image_step_x new_image_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] new_image_stride_y Stride of the input new image in Y dimension (in bytes) - * @param[in] new_image_step_y new_image_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] new_image_offset_first_element_in_bytes The offset of the first element in the input new image - * @param[in, out] new_points An output array of key points. Those key points are defined at the new_images high resolution pyramid - * @param[in] coeff The | A11 | A12 | A22 | min_eig | for each keypoint - * @param[in] iold_val The | ival | ixval | iyval | dummy | for each point in the window centered on old_keypoint - * @param[in] window_dimension The size of the window on which to perform the algorithm - * @param[in] window_dimension_pow2 The squared size of the window on which to perform the algorithm - * @param[in] half_window The half size of the window on which to perform the algorithm - * @param[in] num_iterations The maximum number of iterations - * @param[in] epsilon The value for terminating the algorithm. 
- * @param[in] border_limits It stores the right border limit (width - window_dimension - 1, height - window_dimension - 1,) - * @param[in] eig_const 1.0f / (float)(2.0f * window_dimension * window_dimension) - * @param[in] level0 It is set to 1 if level of pyramid = 0 - * @param[in] term_epsilon It is set to 1 if termination = TERM_CRITERIA_EPSILON - */ -void __kernel lktracker_stage1( - IMAGE_DECLARATION(new_image), - __global float4 *new_points, - __global float4 *coeff, - __global short4 *iold_val, - const int window_dimension, - const int window_dimension_pow2, - const int half_window, - const int num_iterations, - const float epsilon, - const float3 border_limits, - const float eig_const, - const int level0, - const int term_epsilon) -{ - int idx = get_global_id(0); - Image new_image = CONVERT_TO_IMAGE_STRUCT_NO_STEP(new_image); - - // G.s0 = A11, G.s1 = A12, G.s2 = A22, G.s3 = min_eig - float4 G = coeff[idx]; - - // Determinant - float D = G.s0 * G.s2 - G.s1 * G.s1; - - // Check if it is a good point to track - if(G.s3 < EIGENVALUE_THR || D < DETERMINANT_THR) - { - if(level0 == 1) - { - // Invalidate tracked point as we are at level 0 - new_points[idx].s2 = 0; - } - - return; - } - - // Compute inverse - //D = native_recip(D); - D = 1.0 / D; - - // Get new keypoint - float2 new_keypoint = new_points[idx].xy - (float)half_window; - - // Get new point - float2 out_new_point = new_points[idx].xy; - - // Keep delta obtained in the previous iteration - float2 prev_delta = (float2)0.0f; - - int j = 0; - while(j < num_iterations) - { - // Get the floor value - float2 inew_keypoint = floor(new_keypoint); - - // Check if using the window dimension we can go out of boundary in the following for loops. 
If so, invalidate the tracked point - if(any(inew_keypoint < border_limits.zz) || any(inew_keypoint >= border_limits.xy)) - { - if(level0 == 1) - { - // Invalidate tracked point as we are at level 0 - new_points[idx].s2 = 0.0f; - } - else - { - new_points[idx].xy = out_new_point; - } - - return; - } - - // Compute weight for the bilinear interpolation - float2 ab = new_keypoint - inew_keypoint; - - // Weight used for Bilinear-Interpolation on Old and New images - // w.s0 = round((1.0f - ab.x) * (1.0f - ab.y) * D0) - // w.s1 = round(ab.x * (1.0f - ab.y) * D0) - // w.s2 = round((1.0f - ab.x) * ab.y * D0) - // w.s3 = D0 - w.s0 - w.s1 - w.s2 - - float4 w; - w.s3 = ab.x * ab.y; - w.s0 = w.s3 + 1.0f - ab.x - ab.y; - w.s12 = ab - (float2)w.s3; - w = round(w * (float4)D0); - w.s3 = D0 - w.s0 - w.s1 - w.s2; - - // Mismatch vector - int2 ib = 0; - - // Old val offset - int old_val_offset = idx * window_dimension_pow2; - - for(int ky = 0; ky < window_dimension; ++ky) - { - for(int kx = 0; kx < window_dimension; ++kx) - { - // ival, ixval and iyval have been computed in the previous stage - int4 old_ival = convert_int4(iold_val[old_val_offset]); - - // Load values from old_image for computing the bilinear interpolation - float4 px = convert_float4((uchar4)(vload2(0, offset(&new_image, inew_keypoint.x + kx, inew_keypoint.y + ky)), - vload2(0, offset(&new_image, inew_keypoint.x + kx, inew_keypoint.y + ky + 1)))); - - // Compute bilinear interpolation on new image - int jval = (int)round(dot(px, w) * D1); - - // Compute luminance difference - int diff = (int)(jval - old_ival.s0); - - // Accumulate values in mismatch vector - ib += (diff * old_ival.s12); - - // Update old val offset - old_val_offset++; - } - } - - float2 b = convert_float2(ib) * (float2)FLT_SCALE; - - // Optical Flow - float2 delta; - - delta.x = (float)((G.s1 * b.y - G.s2 * b.x) * D); - delta.y = (float)((G.s1 * b.x - G.s0 * b.y) * D); - - // Update new point coordinate - new_keypoint += delta; - - out_new_point 
= new_keypoint + (float2)half_window; - - if(term_epsilon == 1) - { - float mag2 = dot(delta, delta); - - if(mag2 <= epsilon) - { - new_points[idx].xy = out_new_point; - - return; - } - } - - // Check convergence analyzing the previous delta - if(j > 0 && all(fabs(delta + prev_delta) < (float2)0.01f)) - { - out_new_point -= delta * (float2)0.5f; - - new_points[idx].xy = out_new_point; - - return; - } - - // Update previous delta - prev_delta = delta; - - j++; - } - - new_points[idx].xy = out_new_point; -} diff --git a/src/core/CL/cl_kernels/scharr_filter.cl b/src/core/CL/cl_kernels/scharr_filter.cl deleted file mode 100644 index d2868b6731..0000000000 --- a/src/core/CL/cl_kernels/scharr_filter.cl +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2016, 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** This OpenCL kernel computes Scharr3x3. 
- * - * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient - * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_gx_ptr Pointer to the destination image Supported data types: S16 - * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[out] dst_gy_ptr Pointer to the destination image. 
Supported data types: S16 - * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void scharr3x3( - IMAGE_DECLARATION(src) -#ifdef GRAD_X - , - IMAGE_DECLARATION(dst_gx) -#endif /* GRAD_X */ -#ifdef GRAD_Y - , - IMAGE_DECLARATION(dst_gy) -#endif /* GRAD_Y */ -) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); -#ifdef GRAD_X - Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx); -#endif /* GRAD_X */ -#ifdef GRAD_Y - Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy); -#endif /* GRAD_Y */ - - // Output pixels -#ifdef GRAD_X - short8 gx = (short8)0; -#endif /* GRAD_X */ -#ifdef GRAD_Y - short8 gy = (short8)0; -#endif /* GRAD_Y */ - - // Row0 - uchar16 temp = vload16(0, offset(&src, -1, -1)); - short8 left = convert_short8(temp.s01234567); - short8 middle = convert_short8(temp.s12345678); - short8 right = convert_short8(temp.s23456789); -#ifdef GRAD_X - gx += left * (short8)(-3); - gx += right * (short8)(+3); -#endif /* GRAD_X */ -#ifdef GRAD_Y - gy += left * (short8)(-3); - gy += middle * (short8)(-10); - gy += right * (short8)(-3); -#endif /* GRAD_Y */ - - // Row1 - temp = vload16(0, offset(&src, -1, 0)); - left = convert_short8(temp.s01234567); - right = convert_short8(temp.s23456789); -#ifdef GRAD_X - gx += left * (short8)(-10); - gx += right * (short8)(+10); -#endif /* GRAD_X */ - - // Row2 - temp = vload16(0, offset(&src, -1, 1)); - left = convert_short8(temp.s01234567); - middle = convert_short8(temp.s12345678); - right = convert_short8(temp.s23456789); -#ifdef GRAD_X - gx += left * (short8)(-3); - gx += right * 
(short8)(+3); -#endif /* GRAD_X */ -#ifdef GRAD_Y - gy += left * (short8)(+3); - gy += middle * (short8)(+10); - gy += right * (short8)(+3); -#endif /* GRAD_Y */ - - // Store results -#ifdef GRAD_X - vstore8(gx, 0, ((__global short *)dst_gx.ptr)); -#endif /* GRAD_X */ -#ifdef GRAD_Y - vstore8(gy, 0, ((__global short *)dst_gy.ptr)); -#endif /* GRAD_Y */ -} diff --git a/src/core/CL/cl_kernels/tablelookup.cl b/src/core/CL/cl_kernels/tablelookup.cl deleted file mode 100644 index 0ef1648d94..0000000000 --- a/src/core/CL/cl_kernels/tablelookup.cl +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2016, 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** This function performs table lookup on U8 input/output images. - * - * Global Workgroup Size [ DIV_CEIL(width, 8), height ] - * - * - * @param[in] src_ptr Pointer to the source image. 
Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] lut LUT table. Supported data types: U8 - */ -__kernel void tablelookup_U8( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst), - __global uchar *lut) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - /* Load input data */ - uchar8 data = vload8(0, src.ptr); - - /* Load lut data */ - uchar8 lut_data = (uchar8)(lut[data.s0], lut[data.s1], lut[data.s2], lut[data.s3], - lut[data.s4], lut[data.s5], lut[data.s6], lut[data.s7]); - - /* Store result */ - vstore8(lut_data, 0, dst.ptr); -} - -/** This function performs table lookup on S16 input/output images. - * - * Global Workgroup Size [ DIV_CEIL(width, 8), height ] - * - * @param[in] src_ptr Pointer to the source image. 
Supported data types: S16 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: S16 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] lut LUT table. 
Supported data types: S16 - * @param[in] offset LUT offset - * @param[in] count Number of elements in the LUT - */ -__kernel void tablelookup_S16( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst), - __global short *lut, - uint offset, - uint count) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - /* Load input data */ - short8 data = vload8(0, (__global short *)src.ptr); - - /* Load output data */ - int8 out_data = convert_int8(vload8(0, (__global short *)dst.ptr)); - - /* Calculate index */ - int8 index = convert_int8(data) + (int8)(offset); - int8 cond = (index >= 0 && index < (int8)count); - index = select(0, index, cond); - - /* Load lut data */ - int8 lut_data = (int8)(lut[index.s0], lut[index.s1], lut[index.s2], lut[index.s3], - lut[index.s4], lut[index.s5], lut[index.s6], lut[index.s7]); - - /* Select output data depending on condition */ - lut_data = select(out_data, lut_data, cond); - - /* Store result */ - vstore8(convert_short8(lut_data), 0, (__global short *)dst.ptr); -} diff --git a/src/core/CL/cl_kernels/threshold.cl b/src/core/CL/cl_kernels/threshold.cl deleted file mode 100644 index ff3ac05ef4..0000000000 --- a/src/core/CL/cl_kernels/threshold.cl +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) 2016, 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** Perform binary thresholding on an image. - * - * @param[in] in_ptr Pointer to the source image - * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in_offset_first_element_in_bytes The offset of the first element in the first source image - * @param[out] out_ptr Pointer to the destination image - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] false_val False value - * @param[in] true_val True value - * @param[in] threshold The thresold value - */ -__kernel void threshold_binary( - IMAGE_DECLARATION(in), - IMAGE_DECLARATION(out), - const uchar false_val, - const uchar true_val, - const uchar threshold) -{ - // Get pixels pointer - Image in = CONVERT_TO_IMAGE_STRUCT(in); - Image out = 
CONVERT_TO_IMAGE_STRUCT(out); - - // Load data - uchar16 in_data = vload16(0, in.ptr); - - // Perform binary thresholding - in_data = select((uchar16)false_val, (uchar16)true_val, in_data > (uchar16)threshold); - - // Store result - vstore16(in_data, 0, out.ptr); -} - -/** Perform range thresholding on an image. - * - * @param[in] in_ptr Pointer to the source image - * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in_offset_first_element_in_bytes The offset of the first element in the first source image - * @param[out] out_ptr Pointer to the destination image - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] false_val False value - * @param[in] true_val True value - * @param[in] lower Lower threshold - * @param[in] upper Upper threshold - */ -__kernel void threshold_range( - IMAGE_DECLARATION(in), - IMAGE_DECLARATION(out), - const uchar false_val, - const uchar true_val, - const uchar lower, - const uchar upper) -{ - // Get pixels pointer - Image in = CONVERT_TO_IMAGE_STRUCT(in); - Image out = CONVERT_TO_IMAGE_STRUCT(out); - - // Load data - uchar16 in_data = vload16(0, in.ptr); - - // Perform range thresholding - in_data = select((uchar16)true_val, (uchar16)false_val, in_data > (uchar16)upper || in_data < 
(uchar16)lower); - - // Store result - vstore16(in_data, 0, out.ptr); -} diff --git a/src/core/CL/cl_kernels/warp_affine.cl b/src/core/CL/cl_kernels/warp_affine.cl deleted file mode 100644 index 909b92055b..0000000000 --- a/src/core/CL/cl_kernels/warp_affine.cl +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2016, 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" -#include "warp_helpers.h" - -/** Returns a vector of floats contaning the matrix coefficients. */ -inline const float8 build_affine_mtx() -{ - return (float8)(MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, 0, 0); -} - -/** Transforms 4 2D coordinates using the formula: - * - * x0 = M[1][1] * x + M[1][2] * y + M[1][3] - * y0 = M[2][1] * x + M[2][2] * y + M[2][3] - * - * @param[in] coord 2D coordinate to transform. - * @param[in] mtx affine matrix - * - * @return a int8 containing 4 2D transformed values. 
- */ -inline const float8 apply_affine_transform(const float2 coord, const float8 mtx) -{ - const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0); - // transform [x,x+1,x+2,x+3] - const float4 new_x = mad(/*A*/ in_x_coords, (float4)(mtx.s0) /*B*/, mad((float4)(coord.s1), (float4)(mtx.s2), (float4)(mtx.s4))); - // transform [y,y+1,y+2,y+3] - const float4 new_y = mad(in_x_coords, (float4)(mtx.s1), mad((float4)(coord.s1), (float4)(mtx.s3), (float4)(mtx.s5))); - return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); -} - -/** Performs an affine transform on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel U8. - * - * This kernel performs an affine transform with a 2x3 Matrix M with this method of pixel coordinate translation: - * x0 = M[1][1] * x + M[1][2] * y + M[1][3] - * y0 = M[2][1] * x + M[2][2] * y + M[2][3] - * output(x,y) = input(x0,y0) - * - * @attention The matrix coefficients need to be passed at compile time:\n - * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=1 -DMAT3=2 -DMAT4=4 -DMAT5=2 "\n - * clBuildProgram( program, 0, NULL, build_options, NULL, NULL); - * - * @param[in] in_ptr Pointer to the source image. Supported data types: U8. - * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes) - * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: U8. 
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes) - * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image - * @param[in] width Width of the destination image - * @param[in] height Height of the destination image - */ -__kernel void warp_affine_nearest_neighbour( - IMAGE_DECLARATION(in), - IMAGE_DECLARATION(out), - const int width, - const int height) -{ - Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); - Image out = CONVERT_TO_IMAGE_STRUCT(out); - vstore4(read_texels4(&in, convert_int8_rtn(clamp_to_border(apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height))), 0, out.ptr); -} - -/** Performs an affine transform on an image interpolating with the BILINEAR method. Input and output are single channel U8. - * - * @attention The matrix coefficients need to be passed at compile time:\n - * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=1 -DMAT3=2 -DMAT4=4 -DMAT5=2 "\n - * clBuildProgram( program, 0, NULL, build_options, NULL, NULL); - * - * @param[in] in_ptr Pointer to the source image. Supported data types: U8. - * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes) - * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: U8. 
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes) - * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image - * @param[in] width Width of the destination image - * @param[in] height Height of the destination image - */ -__kernel void warp_affine_bilinear( - IMAGE_DECLARATION(in), - IMAGE_DECLARATION(out), - const int width, - const int height) -{ - Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); - Image out = CONVERT_TO_IMAGE_STRUCT(out); - vstore4(bilinear_interpolate(&in, apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height), 0, out.ptr); -} diff --git a/src/core/CL/cl_kernels/warp_perspective.cl b/src/core/CL/cl_kernels/warp_perspective.cl deleted file mode 100644 index bed78388a4..0000000000 --- a/src/core/CL/cl_kernels/warp_perspective.cl +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (c) 2016, 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" -#include "warp_helpers.h" - -/** Returns the perspective matrix */ -inline const float16 build_perspective_mtx() -{ - return (float16)(MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, 0, 0, 0, (float4)0); -} - -/** Transforms four 2D coordinates using the formula: - * - * x0 = M[1][1] * x + M[1][2] * y + M[1][3] - * y0 = M[2][1] * x + M[2][2] * y + M[2][3] - * z0 = M[3][1] * x + M[3][2] * y + M[3][3] - * - * (x0/z0,y0/z0) - * - * @param[in] coord 2D coordinate to transform. - * @param[in] mtx perspective matrix - * - * @return a vector float8 containing four 2D transformed values. 
- */ -inline const float8 apply_perspective_transform(const float2 coord, const float16 mtx) -{ - const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0); - // transform [z,z+1,z+2,z+3] - const float4 z = (float4)mad(in_x_coords, (float4)(mtx.s2), mad((float4)(coord.s1), (float4)(mtx.s5), (float4)(mtx.s8))); - // NOTE: Do not multiply x&y by 1.f/Z as this will result in loss of accuracy and mismatches with VX reference implementation - // transform [x,x+1,x+2,x+3] - const float4 new_x = (float4)mad(in_x_coords, (float4)(mtx.s0), mad((float4)(coord.s1), (float4)(mtx.s3), (float4)(mtx.s6))) / z; - // transform [y,y+1,y+2,y+3] - const float4 new_y = (float4)mad(in_x_coords, (float4)(mtx.s1), mad((float4)(coord.s1), (float4)(mtx.s4), (float4)(mtx.s7))) / z; - return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); -} - -/** Performs perspective transformation on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel U8. - * - * This kernel performs perspective transform with a 3x3 Matrix M with this method of pixel coordinate translation: - * x0 = M[1][1] * x + M[1][2] * y + M[1][3] - * y0 = M[2][1] * x + M[2][2] * y + M[2][3] - * z0 = M[3][1] * x + M[3][2] * y + M[3][3] - * - * output(x,y) = input(x0/z0,y0/z0) - * - * @attention The matrix coefficients need to be passed at compile time:\n - * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=3 -DMAT3=4 -DMAT4=5 -DMAT5=6 -DMAT6=7 -DMAT7=8 -DMAT8=9"\n - * clBuildProgram( program, 0, NULL, build_options, NULL, NULL); - * - * @param[in] in_ptr Pointer to the source image. Supported data types: U8. 
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes) - * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: U8. - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes) - * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image - * @param[in] width Width of the destination image - * @param[in] height Height of the destination image - */ -__kernel void warp_perspective_nearest_neighbour( - IMAGE_DECLARATION(in), - IMAGE_DECLARATION(out), - const int width, - const int height) -{ - Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); - Image out = CONVERT_TO_IMAGE_STRUCT(out); - vstore4(read_texels4(&in, convert_int8_rtn(clamp_to_border(apply_perspective_transform(get_current_coords(), build_perspective_mtx()), width, height))), 0, out.ptr); -} - -/** Performs a perspective transform on an image interpolating with the BILINEAR method. Input and output are single channel U8. - * - * @attention The matrix coefficients need to be passed at compile time:\n - * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=3 -DMAT3=4 -DMAT4=5 -DMAT5=6 -DMAT6=7 -DMAT7=8 -DMAT8=9"\n - * clBuildProgram( program, 0, NULL, build_options, NULL, NULL); - * - * @param[in] in_ptr Pointer to the source image. 
Supported data types: U8. - * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes) - * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: U8. - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes) - * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image - * @param[in] width Width of the destination image - * @param[in] height Height of the destination image - */ -__kernel void warp_perspective_bilinear( - IMAGE_DECLARATION(in), - IMAGE_DECLARATION(out), - const int width, - const int height) -{ - Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); - Image out = CONVERT_TO_IMAGE_STRUCT(out); - vstore4(bilinear_interpolate(&in, apply_perspective_transform(get_current_coords(), build_perspective_mtx()), width, height), 0, out.ptr); -} |