diff options
Diffstat (limited to 'src/core/CL/cl_kernels')
-rw-r--r-- | src/core/CL/cl_kernels/normalization_layer.cl | 145 |
1 files changed, 105 insertions, 40 deletions
diff --git a/src/core/CL/cl_kernels/normalization_layer.cl b/src/core/CL/cl_kernels/normalization_layer.cl index ff4dc8ec38..4569208824 100644 --- a/src/core/CL/cl_kernels/normalization_layer.cl +++ b/src/core/CL/cl_kernels/normalization_layer.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "helpers.h" +#include "tile_helpers.h" #define MUL_OP(x, y) ((x) * (y)) #define ADD_OP(x, y) ((x) + (y)) @@ -29,9 +30,6 @@ #define POW_OP(x, y) pow((x), (y)) #define SQCVT_SAT(a) (a) -#define LOAD_OP(offset, ptr) vload4(offset, ptr) -#define STORE_OP(data, offset, ptr) vstore4(data, offset, ptr) - #if defined(NUM_SLICES) /** Apply cross-map normalization. * @@ -58,8 +56,8 @@ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor */ -__kernel void normalization_layer_cross_map(TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) +__kernel void normalization_layer_cross_map_nchw(TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) { Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); @@ -80,7 +78,7 @@ __kernel void normalization_layer_cross_map(TENSOR3D_DECLARATION(input), for(int i = left_slice; i <= right_slice; i++) { VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, 0, i)); + values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, 0, i)); acc = ADD_OP(acc, MUL_OP(values, values)); } @@ -88,19 +86,84 @@ __kernel void normalization_layer_cross_map(TENSOR3D_DECLARATION(input), const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) normalized = POW_OP(acc, beta_v); const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - normalized_pixel = DIV_OP(LOAD_OP(0, (__global DATA_TYPE *)in.ptr), normalized); + normalized_pixel = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), normalized); - STORE_OP(normalized_pixel, 0, (__global DATA_TYPE *)out.ptr); + VSTORE(VEC_SIZE) + (normalized_pixel, 0, (__global DATA_TYPE *)out.ptr); } #endif /* defined(NUM_SLICES) */ #if defined(WIDTH_SIZE) +/** Apply cross-map normalization. + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16 + * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5 + * @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g. -DNUM_SLICES=192 + * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA + * + * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void normalization_layer_cross_map_nhwc(TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + // Offset computation + const uint x_offs = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER); + + // Address computation + __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z; + __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z; + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + acc = 0; + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + coeff_v = SQCVT_SAT(COEFF); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + beta_v = SQCVT_SAT(BETA); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + kappa_v = SQCVT_SAT(KAPPA); + + const int left_slice = max((int)0, (int)x_offs - (int)RADIUS); + const int right_slice = min((int)WIDTH_SIZE - 1, (int)x_offs + (int)RADIUS); + + for(int i = left_slice; i <= right_slice; ++i) + { + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + i * sizeof(DATA_TYPE))); + acc = ADD_OP(acc, MUL_OP(values, values)); + } + + acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + normalized = POW_OP(acc, beta_v); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + normalized_pixel0 = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + x_offs * sizeof(DATA_TYPE))), normalized); + + STORE_VECTOR_SELECT(normalized_pixel, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); +} + /** Apply in-map normalization when tensors are in the NCHW data layout format. * * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16 * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5 * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA + * @note The leftover size in the X dimension shoud be given as preprocessor argument using -DVEC_SIZE_LEFTOVER is; x_dimension % VEC_SIZE. e.g. -DVEC_SIZE_LEFTOVER=1 * * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32 * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) @@ -126,13 +189,13 @@ __kernel void normalization_layer_in_map_nchw(TENSOR3D_DECLARATION(input), Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - acc = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0; + acc = 0; const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - coeff_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(COEFF); + coeff_v = SQCVT_SAT(COEFF); const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - beta_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(BETA); + beta_v = SQCVT_SAT(BETA); const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - kappa_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(KAPPA); + kappa_v = SQCVT_SAT(KAPPA); const int current_col = get_global_id(0) << 2; const int left_pos = max(-(int)RADIUS, -3 - current_col); @@ -152,10 +215,10 @@ __kernel void normalization_layer_in_map_nchw(TENSOR3D_DECLARATION(input), { #if defined(IN_MAP_2D) VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, j, 0)); + values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, j, 0)); #else /* defined(IN_MAP_2D) */ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, 0, 0)); + values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, 0, 0)); #endif /* defined(IN_MAP_2D) */ acc = ADD_OP(acc, MUL_OP(values, values)); } @@ -167,13 +230,14 @@ __kernel void normalization_layer_in_map_nchw(TENSOR3D_DECLARATION(input), const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) normalized = POW_OP(acc, beta_v); const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - normalized_pixel = DIV_OP(LOAD_OP(0, (__global DATA_TYPE *)in.ptr), normalized); + normalized_pixel = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), normalized); - STORE_OP(normalized_pixel, 0, (__global DATA_TYPE *)out.ptr); + VSTORE(VEC_SIZE) + (normalized_pixel, 0, (__global DATA_TYPE *)out.ptr); } #endif // defined(WIDTH_SIZE) -#if defined(NUM_SLICES) +#if defined(NUM_SLICES) && defined(DIM1_SIZE) /** Apply in-map normalization when tensors are in the NHWC data layout format. * * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short @@ -202,42 +266,43 @@ __kernel void normalization_layer_in_map_nchw(TENSOR3D_DECLARATION(input), __kernel void normalization_layer_in_map_nhwc(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) { - Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); + // Offset computation + const uint x_offs = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER); + const int current_cols = get_global_id(1); + const int current_rows = get_global_id(2); + + // Address computation + __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE); + __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + current_cols * output_stride_y + current_rows * output_stride_z; VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - acc = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0; + acc = 0; const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - coeff_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(COEFF); + coeff_v = SQCVT_SAT(COEFF); const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - beta_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(BETA); + beta_v = SQCVT_SAT(BETA); const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - kappa_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(KAPPA); + kappa_v = SQCVT_SAT(KAPPA); - const int current_cols = get_global_id(1); - const int first_col = max(-(int)RADIUS, -current_cols); - const int last_col = min((int)RADIUS, (int)get_global_size(1) - 1 - current_cols); + const int first_col = max(0, current_cols - (int)RADIUS); + const int last_col = min((int)DIM1_SIZE - 1, current_cols + (int)RADIUS); #if defined(IN_MAP_2D) - const int current_rows = get_global_id(2); - const int first_row = max(-(int)RADIUS, -current_rows); - const int last_row = min((int)RADIUS, (int)NUM_SLICES - 1 - current_rows); + const int first_row = max(0, current_rows - (int)RADIUS); + const int last_row = min((int)NUM_SLICES - 1, current_rows + (int)RADIUS); #endif /* defined(IN_MAP_2D) */ #if defined(IN_MAP_2D) for(int j = first_row; j <= last_row; ++j) { +#else // defined(IN_MAP_2D) + const int j = current_rows; #endif /* defined(IN_MAP_2D) */ for(int i = first_col; i <= last_col; ++i) { -#if defined(IN_MAP_2D) - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, i, j)); -#else /* defined(IN_MAP_2D) */ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, i, 0)); -#endif /* defined(IN_MAP_2D) */ - acc = ADD_OP(acc, MUL_OP(values, values)); + values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + i * input_stride_y + j * input_stride_z)); + acc = ADD_OP(acc, MUL_OP(values, values)); } #if defined(IN_MAP_2D) } @@ -247,8 +312,8 @@ __kernel void normalization_layer_in_map_nhwc(TENSOR3D_DECLARATION(input), const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) normalized = POW_OP(acc, beta_v); const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - normalized_pixel = DIV_OP(LOAD_OP(0, (__global DATA_TYPE *)in.ptr), normalized); + normalized_pixel0 = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + current_cols * output_stride_y + current_rows * output_stride_z)), normalized); - STORE_OP(normalized_pixel, 0, (__global DATA_TYPE *)out.ptr); + STORE_VECTOR_SELECT(normalized_pixel, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); } -#endif /* defined(NUM_SLICES) */ +#endif // defined(NUM_SLICES) && defined(DIM1_SIZE)
\ No newline at end of file |