aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl
diff options
context:
space:
mode:
authorMichele Di Giorgio <michele.digiorgio@arm.com>2019-10-08 09:36:26 +0100
committerMichele Di Giorgio <michele.digiorgio@arm.com>2019-10-15 10:27:18 +0000
commita046e164b96a8441b2fa14ef578f7db46a0e97da (patch)
tree9fa2b7e003342b608acd3ed627f47f9d027ef72c /src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl
parent76c996f3b240eb1f60a566e5b0a5e61fe363685a (diff)
downloadComputeLibrary-a046e164b96a8441b2fa14ef578f7db46a0e97da.tar.gz
COMPMID-2600: Implement a new and generic depthwise convolution for CL QASYMM8 NHWC
The NCHW case is supported at function level by permuting the inputs/outputs to NHWC. This patch also removes CLDirectConvolutionLayerOutputStageKernel which is deprecated and some kernels which were only used in the generic case of depthwise convolution. Change-Id: I91e0f02d0a2f4a4a352e08c248e648944137fe68 Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com> Reviewed-on: https://review.mlplatform.org/c/2056 Reviewed-by: Giorgio Arena <giorgio.arena@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Diffstat (limited to 'src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl')
-rw-r--r--src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl78
1 files changed, 1 insertions, 77 deletions
diff --git a/src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl b/src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl
index 83da76785b..5ad9afb23c 100644
--- a/src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl
+++ b/src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -247,79 +247,3 @@ __kernel void direct_convolution_1x1_3x3_5x5_quantized(
vstore8(convert_uchar8_sat(pixels0), 0, (__global uchar *)dst.ptr);
}
#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
-
-#if defined(VEC_SIZE)
-
-#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
-#define CONVERT_SAT_UCHAR_STR(x, size) (convert_uchar##size##_sat((x)))
-#define CONVERT_SAT_UCHAR(x, size) CONVERT_SAT_UCHAR_STR(x, size)
-
-/** This function computes the output stage of a depthwise convolution.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] bias_ptr (Optional) Pointer to the biases vector. Supported data types: S32
- * @param[in] bias_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
- * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
- * @param[in] output_offset Quantized offset of zero point of the output tensor data range
- * @param[in] output_multiplier Output scale multiplier
- * @param[in] output_shift Output scale divisor exponent
- */
-__kernel void output_stage_quantized(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(bias),
-#endif //defined(HAS_BIAS)
- int output_offset,
- int output_multiplier,
- int output_shift)
-{
- Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
-#if defined(HAS_BIAS)
- Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
-#endif //defined(HAS_BIAS)
-
- // Load input
- VEC_INT vals = VLOAD(VEC_SIZE)(0, (__global int *)(src.ptr));
-
-#if defined(HAS_BIAS)
- // Load and add bias
-#if defined(NCHW)
- int bias_value = *((__global int *)(vector_offset(&bias, get_global_id(2))));
-#else // defined(NCHW)
- VEC_INT bias_value = VLOAD(VEC_SIZE)(0, ((__global int *)(vector_offset(&bias, get_global_id(0) * VEC_SIZE))));
-#endif // defined(NCHW)
-
- vals += (VEC_INT)(bias_value);
-#endif //defined(HAS_BIAS)
-
- vals = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(vals, output_multiplier, output_shift, VEC_SIZE);
- vals = vals + output_offset;
-
- // Store result in dst
- VSTORE(VEC_SIZE)
- (CONVERT_SAT_UCHAR(vals, VEC_SIZE), 0, (__global uchar *)dst.ptr);
-}
-
-#undef VEC_INT
-#undef CONVERT_SAT_UCHAR_STR
-#undef CONVERT_SAT_UCHAR
-
-#endif // defined(VEC_SIZE)