aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/cl_kernels/activation_layer_quant.cl
diff options
context:
space:
mode:
authorGiorgio Arena <giorgio.arena@arm.com>2020-10-02 10:20:11 +0100
committerGiorgio Arena <giorgio.arena@arm.com>2020-10-07 14:28:17 +0000
commitd304adbb1c6a2f66144c9cac1104f6e3f30d255a (patch)
tree325849f9280cfb0c92900794371d1c63d70a619c /src/core/CL/cl_kernels/activation_layer_quant.cl
parent1e75adac392dd979bd1a838583ed196e311bc77a (diff)
downloadComputeLibrary-d304adbb1c6a2f66144c9cac1104f6e3f30d255a.tar.gz
COMPMID-3703 Remove OpenCL padding: CLActivationLayerKernel + create utility macro
Change-Id: I73edadc7299247e7bc51ae37c00d3709023da44a Signed-off-by: Giorgio Arena <giorgio.arena@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4073 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/CL/cl_kernels/activation_layer_quant.cl')
-rw-r--r--src/core/CL/cl_kernels/activation_layer_quant.cl36
1 files changed, 19 insertions, 17 deletions
diff --git a/src/core/CL/cl_kernels/activation_layer_quant.cl b/src/core/CL/cl_kernels/activation_layer_quant.cl
index 0481319428..d8f56c093a 100644
--- a/src/core/CL/cl_kernels/activation_layer_quant.cl
+++ b/src/core/CL/cl_kernels/activation_layer_quant.cl
@@ -66,34 +66,35 @@ __kernel void activation_layer_quant_f32(
#endif /* not IN_PLACE */
)
{
+ uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
+
// Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
#ifdef IN_PLACE
- Tensor3D output = input;
+ __global uchar *output_addr = input_addr;
#else /* IN_PLACE */
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
#endif /* IN_PLACE */
// Load data
- TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
+ TYPE data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
- VEC_FLOAT data_flt = CONVERT(data, VEC_FLOAT);
+ VEC_FLOAT data_flt = CONVERT(data0, VEC_FLOAT);
#if defined(O1_VAL)
data_flt = round(data_flt - (float)O1_VAL) * ((float)S1_VAL);
#else // defined(O1_VAL)
- data_flt = round(data_flt) * ((float)S1_VAL);
+ data_flt = round(data_flt) * ((float)S1_VAL);
#endif // defined(O1_VAL)
data_flt = ACTIVATION(ACT, float, data_flt, A_VAL, B_VAL);
#if defined(O2_VAL)
- data = CONVERT_SAT(round(data_flt / ((float)S2_VAL)) + (float)O2_VAL, TYPE);
+ data0 = CONVERT_SAT(round(data_flt / ((float)S2_VAL)) + (float)O2_VAL, TYPE);
#else // defined(O2_VAL)
- data = CONVERT_SAT(round(data_flt / ((float)S2_VAL)), TYPE);
+ data0 = CONVERT_SAT(round(data_flt / ((float)S2_VAL)), TYPE);
#endif // defined(O2_VAL)
// Store result
- VSTORE(VEC_SIZE)
- (data, 0, (__global DATA_TYPE *)output.ptr);
+ STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
#else // defined(FLOAT_DOMAIN)
@@ -137,22 +138,23 @@ __kernel void activation_layer_quant(
#endif /* not IN_PLACE */
)
{
+ uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
+
// Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
#ifdef IN_PLACE
- Tensor3D output = input;
+ __global uchar *output_addr = input_addr;
#else /* IN_PLACE */
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
#endif /* IN_PLACE */
// Load data
- TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
+ TYPE data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
- data = PERFORM_ACTIVATION_QUANT(ACT, data);
+ data0 = PERFORM_ACTIVATION_QUANT(ACT, data0);
// Store result
- VSTORE(VEC_SIZE)
- (data, 0, (__global DATA_TYPE *)output.ptr);
+ STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
#endif // defined(ACT)
#endif // defined(FLOAT_DOMAIN)